diff --git a/.cirrus.yml b/.cirrus.yml
index 6c2baf8a0..b4c4870d0 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -29,7 +29,7 @@ task:
     - mkdir build
     - cd build
     - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
-    - make
+    - make -j 4

 task:
   name: AppleM1/GCC/MAKE/OPENMP
diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml
index 4a9bf98b6..f1bf8064c 100644
--- a/.github/workflows/loongarch64.yml
+++ b/.github/workflows/loongarch64.yml
@@ -16,13 +16,13 @@ jobs:
       include:
         - target: LOONGSONGENERIC
           triple: loongarch64-unknown-linux-gnu
-          opts: NO_SHARED=1 TARGET=LOONGSONGENERIC
+          opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC
         - target: LOONGSON3R5
           triple: loongarch64-unknown-linux-gnu
-          opts: NO_SHARED=1 TARGET=LOONGSON3R5
+          opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5
         - target: LOONGSON2K1000
           triple: loongarch64-unknown-linux-gnu
-          opts: NO_SHARED=1 TARGET=LOONGSON2K1000
+          opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000
         - target: DYNAMIC_ARCH
           triple: loongarch64-unknown-linux-gnu
           opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC
@@ -40,8 +40,9 @@ jobs:

     - name: Download and install loongarch64-toolchain
       run: |
-        wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz
-        tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt
+        wget https://github.com/sunhaiyong1978/CLFS-for-LoongArch/releases/download/8.1/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
+        #wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
+        tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt

     - name: Set env
       run: |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6b65871f8..f3eac2edf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ project(OpenBLAS C ASM)

 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 25)
+set(OpenBLAS_PATCH_VERSION 25.dev)

 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
@@ -249,20 +249,21 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago
   endif()
 endif()

-if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS)
+# It seems this hack is not required since macOS 11 Big Sur
+if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
   set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
   if (NOT NOFORTRAN)
     set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
     set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
-      "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
-      "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
+      "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
+      "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
       "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
       "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
       "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
   else ()
     set (CMAKE_C_CREATE_SHARED_LIBRARY
-      "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
-      "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
+      "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
+      "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
       "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
   endif ()
 endif()
@@ -541,7 +542,7 @@ if(NOT NO_LAPACKE)
   ADD_CUSTOM_TARGET(genlapacke
   COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
   )
-  install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
+  install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 endif()

 # Install pkg-config files
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 71df13634..493747052 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -216,3 +216,6 @@ In chronological order:

 * Pablo Romero
   * [2022-08] Fix building from sources for QNX
+
+* Mark Seminatore
+  * [2023-11-09] Improve Windows threading performance scaling
\ No newline at end of file
diff --git a/Changelog.txt b/Changelog.txt
index e0fe0ca5a..b6139d6b7 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,49 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.26
+  2-Jan-2024
+
+general:
+- improved the version of openblas.pc that is created by the CMAKE build
+- fixed a CMAKE-specific build problem on older versions of MacOS
+- worked around linking problems on old versions of MacOS
+- corrected installation location of the lapacke_mangling header in CMAKE builds
+- added type declarations for complex variables to the MSVC-specific parts of the LAPACK header
+- significantly sped up ?GESV for small problem sizes by introducing a lower bound for multithreading
+- imported additions and corrections from the Reference-LAPACK project:
+  - added new LAPACK functions for truncated QR with pivoting (Reference-LAPACK PRs 891&941)
+  - handle miscalculation of minimum work array size in corner cases (Reference-LAPACK PR 942)
+  - fixed use of uninitialized variables in ?GEDMD and improved inline documentation (PR 959)
+  - fixed use of uninitialized variables (and consequential failures) in ?BBCSD (PR 967)
+  - added tests for the recently introduced Dynamic Mode Decomposition functions (PR 736)
+  - fixed several memory leaks in the LAPACK testsuite (PR 953)
+  - fixed counting of testsuite results by the Python script (PR 954)
+
+x86-64:
+- fixed computation of CASUM on SkylakeX and newer targets in the special
+  case that AVX512 is not supported by the compiler or operating environment
+- fixed potential undefined behaviour in the CASUM/ZASUM kernels for AVX512 targets
+- worked around a problem in the pre-AVX kernels for GEMV
+- sped up the thread management code on MS Windows
+
+arm64:
+- fixed building of the LAPACK testsuite with Xcode 15 on Apple M1 and newer
+- sped up the thread management code on MS Windows
+- sped up SGEMM and DGEMM on Neoverse V1 and N1
+- sped up ?DOT on SVE-capable targets
+- reduced the number of targets in DYNAMIC_ARCH builds by eliminating functionally equivalent ones
+- included support for Apple M1 and newer targets in DYNAMIC_ARCH builds
+
+power:
+- improved the SGEMM kernel for POWER10
+- fixed compilation with (very) old versions of gcc
+- fixed detection of old 32bit PPC targets in CMAKE-based builds
+- added autodetection of the POWERPC 7400 subtype
+- fixed CMAKE-based compilation for PPCG4 and PPC970 targets
+
+loongarch64:
+- added and improved optimized kernels for almost all BLAS functions
+
 ====================================================================
 Version 0.3.25
   12-Nov-2023
diff --git a/GotoBLAS_06WeirdPerformance.txt b/GotoBLAS_06WeirdPerformance.txt
index 05766e17b..0f7cec5c9 100644
--- a/GotoBLAS_06WeirdPerformance.txt
+++ b/GotoBLAS_06WeirdPerformance.txt
@@ -11,7 +11,7 @@
    operation is finished.

-2. Simlar problem may happen under virtual machine. If supervisor
+2. Similar problem may happen under virtual machine. If supervisor
    allocates different cores for each scheduling, BLAS performnace
    will be bad. This is because BLAS also utilizes all cache,
    unexpected re-schedule for different core may result of heavy
diff --git a/Makefile.power b/Makefile.power
index 95bada34f..3fa6d6faf 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -11,7 +11,19 @@ endif

 ifeq ($(CORE), POWER10)
 ifneq ($(C_COMPILER), PGI)
+ifeq ($(C_COMPILER), GCC)
+ifeq ($(GCCVERSIONGTEQ10), 1)
 CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+else ifneq ($(GCCVERSIONGT4), 1)
+$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
+CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
+else
+$(warning your compiler is too old to fully support POWER10, getting a newer version of gcc is recommended)
+CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
+endif
+else
+CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+endif
 ifeq ($(F_COMPILER), IBM)
 FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize
 else
diff --git a/Makefile.rule b/Makefile.rule
index fd44b1b12..58f02358e 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #

 # This library's version
-VERSION = 0.3.25
+VERSION = 0.3.25.dev

 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
diff --git a/Makefile.system b/Makefile.system
index 1b84195e4..e602eaf05 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -407,6 +407,7 @@ XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables |awk '/vers
 endif
 ifeq (x$(XCVER), x 15)
 CCOMMON_OPT += -Wl,-ld_classic
+FCOMMON_OPT += -Wl,-ld_classic
 endif
 endif
@@ -676,16 +677,12 @@ ifeq ($(ARCH), arm64)
 DYNAMIC_CORE = ARMV8
 DYNAMIC_CORE += CORTEXA53
 DYNAMIC_CORE += CORTEXA57
-DYNAMIC_CORE += CORTEXA72
-DYNAMIC_CORE += CORTEXA73
 DYNAMIC_CORE += NEOVERSEN1
 ifneq ($(NO_SVE), 1)
 DYNAMIC_CORE += NEOVERSEV1
 DYNAMIC_CORE += NEOVERSEN2
 DYNAMIC_CORE += ARMV8SVE
 endif
-DYNAMIC_CORE += CORTEXA55
-DYNAMIC_CORE += FALKOR
 DYNAMIC_CORE += THUNDERX
 DYNAMIC_CORE += THUNDERX2T99
 DYNAMIC_CORE += TSV110
diff --git a/README.md b/README.md
index 3c4e38f18..b8d66ed42 100644
--- a/README.md
+++ b/README.md
@@ -196,20 +196,22 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
   ```sh
   make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran
   ```
-  (also known to work on C906)
+  (also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision)

 ### Support for multiple targets in a single library

 OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.

-For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default.
+For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX, Cooper Lake, Sapphire Rapids. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default.

 `DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias, Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.

-On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
+On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. If compiler support for SVE is available at build time, support for NeoverseN2, NeoverseV1 as well as generic ArmV8SVE targets is also enabled.

-For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
+For **POWER**, the list encompasses POWER6, POWER8 and POWER9. POWER10 is additionally available if a sufficiently recent compiler is used for the build.
+
+On **ZARCH** it comprises Z13 and Z14 as well as generic zarch support.

 The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the common code in the library, usually you will want to set this to the oldest model you expect to encounter.
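For reference, a `DYNAMIC_ARCH` build can be asked at runtime which kernel set its CPU detection selected. A minimal sketch (illustrative only, not part of this patch; `openblas_get_corename()` and `openblas_get_num_threads()` are utility functions declared in OpenBLAS's `cblas.h`):

```c
#include <stdio.h>
#include <cblas.h>   /* declares the openblas_* utility functions */

int main(void) {
    /* Name of the kernel set chosen by runtime CPU detection,
       e.g. "NEOVERSEN1" on Apple M1 in an arm64 DYNAMIC_ARCH build. */
    printf("core name : %s\n", openblas_get_corename());
    /* Number of threads the library will use by default. */
    printf("threads   : %d\n", openblas_get_num_threads());
    return 0;
}
```

Linking the program against the freshly built `libopenblas` and running it is a quick way to confirm that the dispatch tables described above picked the intended target.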
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 317bc504a..3ae8615a7 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -288,9 +288,9 @@ jobs:
       vmImage: 'ubuntu-latest'
     steps:
     - script: |
-        wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \
-          && echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc  alpine-chroot-install' | sha1sum -c \
-          || exit 1
+        wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.14.0/alpine-chroot-install \
+          && echo 'ccbf65f85cdc351851f8ad025bb3e65bae4d5b06  alpine-chroot-install' | sha1sum -c \
+          || exit 1
         alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
         sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
         alpine make DYNAMIC_ARCH=1 BINARY=64
diff --git a/benchmark/trsv.c b/benchmark/trsv.c
index 66ac3a3c7..e17c57157 100644
--- a/benchmark/trsv.c
+++ b/benchmark/trsv.c
@@ -127,7 +127,7 @@ int main(int argc, char *argv[]){
     long long muls = n*(n+1)/2.0;
     long long adds = (n - 1.0)*n/2.0;

-    fprintf(stderr, "%10d %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg);
+    fprintf(stderr, "%10d : %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg);
     if(a != NULL){
       free(a);
     }
diff --git a/c_check b/c_check
index b018c10a8..b5e4a9ad0 100755
--- a/c_check
+++ b/c_check
@@ -199,8 +199,7 @@ if [ "$architecture" = "loongarch64" ]; then
     tmpd="$(mktemp -d)"
     tmplsx="$tmpd/lsx.c"
     codelsx='"vadd.b $vr0, $vr0, $vr0"'
-    lsx_flags='-march=loongarch64 -mlsx'
-    printf "#include <lsxintrin.h>\n\n" >> "$tmplsx"
+    lsx_flags='-march=loongarch64'
     printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
     args="$lsx_flags -o $tmplsx.o $tmplsx"
     {
@@ -211,8 +210,7 @@ if [ "$architecture" = "loongarch64" ]; then
     tmplasx="$tmpd/lasx.c"
     codelasx='"xvadd.b $xr0, $xr0, $xr0"'
-    lasx_flags='-march=loongarch64 -mlasx'
-    printf "#include <lasxintrin.h>\n\n" >> "$tmplasx"
+    lasx_flags='-march=loongarch64'
     printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
     args="$lasx_flags -o $tmplasx.o $tmplasx"
     {
diff --git a/c_check.pl b/c_check.pl
index 7a860a211..d9c36793c 100644
--- a/c_check.pl
+++ b/c_check.pl
@@ -241,8 +241,7 @@ if (($architecture eq "loongarch64")) {
     } else {
         $tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
         $codelsx = '"vadd.b $vr0, $vr0, $vr0"';
-        $lsx_flags = "-march=loongarch64 -mlsx";
-        print $tmplsx "#include <lsxintrin.h>\n\n";
+        $lsx_flags = "-march=loongarch64";
         print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n";

         $args = "$lsx_flags -o $tmplsx.o $tmplsx";
@@ -257,8 +256,7 @@ if (($architecture eq "loongarch64")) {
         $tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
         $codelasx = '"xvadd.b $xr0, $xr0, $xr0"';
-        $lasx_flags = "-march=loongarch64 -mlasx";
-        print $tmplasx "#include <lasxintrin.h>\n\n";
+        $lasx_flags = "-march=loongarch64";
         print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n";

         $args = "$lasx_flags -o $tmplasx.o $tmplasx";
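Both scripts now probe LSX/LASX support by asking the toolchain to assemble a bare vector instruction, rather than by including intrinsic headers and passing `-mlsx`/`-mlasx`. The probe source they emit is equivalent to this standalone file (reproduced from the printf lines above; `void main` is kept verbatim as the scripts write it):

```c
/* lsx.c - LSX probe as emitted by c_check / c_check.pl.
   Compile with: $CC -march=loongarch64 -o lsx.o lsx.c
   If compilation succeeds, the assembler understands LSX;
   the LASX variant uses "xvadd.b $xr0, $xr0, $xr0" instead. */
void main(void){ __asm__ volatile("vadd.b $vr0, $vr0, $vr0");}
```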
diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index ebdc5a833..eb974456b 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -44,7 +44,7 @@ endif ()

 if (DYNAMIC_ARCH)
   if (ARM64)
-    set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
+    set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
     if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
       set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
     endif ()
diff --git a/cmake/cc.cmake b/cmake/cc.cmake
index 7b4ef8947..2da941afb 100644
--- a/cmake/cc.cmake
+++ b/cmake/cc.cmake
@@ -36,9 +36,19 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS

   if (LOONGARCH64)
     if (BINARY64)
-      set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
+      CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
+      if(COMPILER_SUPPORT_LP64D_ABI)
+        set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64d")
+      else()
+        set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
+      endif ()
     else ()
-      set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32")
+      CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
+      if(COMPILER_SUPPORT_ILP32D_ABI)
+        set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=ilp32d")
+      else()
+        set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32")
+      endif ()
     endif ()
     set(BINARY_DEFINED 1)
   endif ()
@@ -282,6 +292,27 @@ if (${CORE} STREQUAL POWER8)
   endif ()
 endif ()

+# With -mcpu=970 added it compiles, but the library is broken, at least on macOS. If someone
+# tests on *BSD or Linux and adds this flag, please make sure it is not used for the macOS case.
+if (${CORE} STREQUAL PPC970)
+  if (NOT DYNAMIC_ARCH)
+    set (CCOMMON_OPT "${CCOMMON_OPT} -mtune=970 -maltivec -fno-fast-math")
+  endif ()
+  if (APPLE)
+    set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL")
+  endif ()
+endif ()
+
+# -mcpu=G4 seems to work fine, but perhaps avoid it for the sake of consistency?
+if (${CORE} STREQUAL PPCG4)
+  if (NOT DYNAMIC_ARCH)
+    set (CCOMMON_OPT "${CCOMMON_OPT} -mtune=G4 -maltivec -fno-fast-math")
+  endif ()
+  if (APPLE)
+    set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL")
+  endif ()
+endif ()
+
 if (NOT DYNAMIC_ARCH)
   if (HAVE_AVX2)
     set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2")
diff --git a/cmake/fc.cmake b/cmake/fc.cmake
index c496f6368..5c30be843 100644
--- a/cmake/fc.cmake
+++ b/cmake/fc.cmake
@@ -61,9 +61,19 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
   endif ()
   if (LOONGARCH64)
     if (BINARY64)
-      set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
+      CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
+      if(COMPILER_SUPPORT_LP64D_ABI)
+        set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d")
+      else()
+        set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
+      endif ()
     else ()
-      set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
+      CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
+      if(COMPILER_SUPPORT_ILP32D_ABI)
+        set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d")
+      else()
+        set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
+      endif ()
     endif ()
   endif ()
   if (RISCV64)
diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake
index 22476f561..003a8b3c1 100644
--- a/cmake/lapack.cmake
+++ b/cmake/lapack.cmake
@@ -52,7 +52,7 @@ set(SLASRC
    sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f
    sgehd2.f sgehrd.f sgelq2.f sgelqf.f
    sgels.f sgelsd.f sgelss.f sgelsy.f sgeql2.f sgeqlf.f
-   sgeqp3.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f
+   sgeqp3.f sgeqp3rk.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f
    sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f sgetc2.f
    sgetrf2.f sgetri.f
    sggbak.f sggbal.f
@@ -67,7 +67,7 @@ set(SLASRC
    slangb.f slange.f slangt.f slanhs.f
    slansb.f slansp.f slansy.f slantb.f slantp.f slantr.f slanv2.f
    slapll.f slapmt.f
-   slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f
+   slaqgb.f slaqge.f slaqp2.f slaqps.f slaqp2rk.f slaqp3rk.f slaqsb.f slaqsp.f slaqsy.f
    slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f slaqtr.f slar1v.f slar2v.f
    ilaslr.f ilaslc.f
    slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
@@ -139,7 +139,7 @@ set(CLASRC
    cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f
    cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f
    cgehd2.f cgehrd.f cgelq2.f cgelqf.f
-   cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f
+   cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f cgeqp3rk.f
    cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f cgerq2.f cgerqf.f
    cgesc2.f cgesdd.f cgesvd.f cgesvdx.f
    cgesvj.f cgejsv.f cgsvj0.f cgsvj1.f
@@ -173,7 +173,7 @@ set(CLASRC
    clanhb.f clanhe.f clanhp.f clanhs.f clanht.f clansb.f clansp.f
    clansy.f clantb.f clantp.f clantr.f
    clapll.f clapmt.f clarcm.f claqgb.f claqge.f
-   claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f
+   claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqp2rk.f claqp3rk.f claqsb.f
    claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
    claqz0.f claqz1.f claqz2.f claqz3.f
    claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
@@ -243,7 +243,7 @@ set(DLASRC
    dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f
    dgehd2.f dgehrd.f dgelq2.f dgelqf.f
    dgels.f dgelsd.f dgelss.f dgelsy.f dgeql2.f dgeqlf.f
-   dgeqp3.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f
+   dgeqp3.f dgeqp3rk.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f
    dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f dgetc2.f
    dgetrf2.f dgetri.f
    dggbak.f dggbal.f
@@ -258,7 +258,7 @@ set(DLASRC
    dlangb.f dlange.f dlangt.f dlanhs.f
    dlansb.f dlansp.f dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f
    dlapll.f dlapmt.f
-   dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
+   dlaqgb.f dlaqge.f dlaqp2.f dlaqp2rk.f dlaqp3rk.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
    dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f dlaqtr.f dlar1v.f dlar2v.f
    iladlr.f iladlc.f
    dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
@@ -331,7 +331,7 @@ set(ZLASRC
    zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f
    zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f
    zgehd2.f zgehrd.f zgelq2.f zgelqf.f
-   zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f
+   zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f zgeqp3rk.f
    zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f
    zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvx.f
    zgesvj.f zgejsv.f zgsvj0.f zgsvj1.f
@@ -367,7 +367,7 @@ set(ZLASRC
    zlanhe.f zlanhp.f zlanhs.f zlanht.f
    zlansb.f zlansp.f zlansy.f zlantb.f zlantp.f zlantr.f
    zlapll.f zlapmt.f zlaqgb.f zlaqge.f
-   zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f
+   zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqp2rk.f zlaqp3rk.f zlaqps.f zlaqsb.f
    zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
    zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
    zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f
@@ -557,7 +557,7 @@ set(SLASRC
    sgebrd.c sgecon.c sgeequ.c sgees.c sgeesx.c sgeev.c sgeevx.c
    sgehd2.c sgehrd.c sgelq2.c sgelqf.c
    sgels.c sgelsd.c sgelss.c sgelsy.c sgeql2.c sgeqlf.c
-   sgeqp3.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c
+   sgeqp3.c sgeqp3rk.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c
    sgesc2.c sgesdd.c sgesvd.c sgesvdx.c sgesvx.c sgetc2.c
   sgetrf2.c sgetri.c
    sggbak.c sggbal.c
@@ -571,7 +571,7 @@ set(SLASRC
    slangb.c slange.c slangt.c slanhs.c
    slansb.c slansp.c slansy.c slantb.c slantp.c slantr.c slanv2.c
    slapll.c slapmt.c
-   slaqgb.c slaqge.c slaqp2.c slaqps.c slaqsb.c slaqsp.c slaqsy.c
+   slaqgb.c slaqge.c slaqp2.c slaqp2rk.c slaqp3rk.c slaqps.c slaqsb.c slaqsp.c slaqsy.c
    slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c slaqtr.c slar1v.c slar2v.c
    ilaslr.c ilaslc.c
    slarf.c slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c
@@ -643,7 +643,7 @@ set(CLASRC
    cgbtf2.c cgbtrf.c cgbtrs.c cgebak.c cgebal.c cgebd2.c cgebrd.c
    cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c
    cgehd2.c cgehrd.c cgelq2.c cgelqf.c
-   cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c
+   cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c cgeqp3rk.c
    cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c
    cgesc2.c cgesdd.c cgesvd.c cgesvdx.c
    cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c
@@ -677,7 +677,7 @@ set(CLASRC
    clanhb.c clanhe.c clanhp.c clanhs.c clanht.c clansb.c clansp.c
    clansy.c clantb.c clantp.c clantr.c
    clapll.c clapmt.c clarcm.c claqgb.c claqge.c
-   claqhb.c claqhe.c claqhp.c claqp2.c claqps.c claqsb.c
+   claqhb.c claqhe.c claqhp.c claqp2.c claqp2rk.c claqp3rk.c claqps.c claqsb.c
    claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c
    claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c
    clarf.c clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c
@@ -746,7 +746,7 @@ set(DLASRC
    dgebrd.c dgecon.c dgeequ.c dgees.c dgeesx.c dgeev.c dgeevx.c
    dgehd2.c dgehrd.c dgelq2.c dgelqf.c
    dgels.c dgelsd.c dgelss.c dgelsy.c dgeql2.c dgeqlf.c
-   dgeqp3.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c
+   dgeqp3.c dgeqp3rk.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c
    dgesc2.c dgesdd.c dgesvd.c dgesvdx.c dgesvx.c dgetc2.c
    dgetrf2.c dgetri.c
    dggbak.c dggbal.c
@@ -760,7 +760,7 @@ set(DLASRC
    dlangb.c dlange.c dlangt.c dlanhs.c
    dlansb.c dlansp.c dlansy.c dlantb.c dlantp.c dlantr.c dlanv2.c
    dlapll.c dlapmt.c
-   dlaqgb.c dlaqge.c dlaqp2.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c
+   dlaqgb.c dlaqge.c dlaqp2.c dlaqp2rk.c dlaqp3rk.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c
    dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c dlaqtr.c dlar1v.c dlar2v.c
    iladlr.c iladlc.c
    dlarf.c dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c
@@ -833,7 +833,7 @@ set(ZLASRC
    zgbtf2.c zgbtrf.c zgbtrs.c zgebak.c zgebal.c zgebd2.c zgebrd.c
    zgecon.c zgeequ.c zgees.c zgeesx.c zgeev.c zgeevx.c
    zgehd2.c zgehrd.c zgelq2.c zgelqf.c
-   zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c
+   zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c zgeqp3rk.c
    zgeqr2.c zgeqr2p.c zgeqrf.c zgeqrfp.c zgerfs.c zgerq2.c zgerqf.c
    zgesc2.c zgesdd.c zgesvd.c zgesvdx.c zgesvx.c
    zgesvj.c zgejsv.c zgsvj0.c zgsvj1.c
@@ -868,7 +868,7 @@ set(ZLASRC
    zlanhe.c zlanhp.c zlanhs.c zlanht.c
    zlansb.c zlansp.c zlansy.c zlantb.c zlantp.c zlantr.c
    zlapll.c zlapmt.c zlaqgb.c zlaqge.c
-   zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqps.c zlaqsb.c
+   zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqp2rk.c zlaqp3rk.c zlaqps.c zlaqsb.c
    zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c
    zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c
    zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c
diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in
index 7e120af86..11e5606e5 100644
--- a/cmake/openblas.pc.in
+++ b/cmake/openblas.pc.in
@@ -5,7 +5,7 @@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
 openblas_config=USE_64BITINT=@INTERFACE64@ NO_CBLAS=@NO_CBLAS@
NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
 Name: OpenBLAS
 Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
-Version: @OPENBLAS_VERSION@
-URL: https://github.com/xianyi/OpenBLAS
+Version: @OpenBLAS_VERSION@
+URL: https://github.com/OpenMathLib/OpenBLAS
 Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix}
 Cflags: -I${includedir}
diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake
index 49b9863e3..e94497a04 100644
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -38,7 +38,7 @@ if(CMAKE_CL_64 OR MINGW64)
   endif()
 elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING))
   set(X86 1)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc.*"))
   set(POWER 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
   set(MIPS64 1)
@@ -46,7 +46,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
   set(LOONGARCH64 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*")
   set(RISCV64 1)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*"))
   if (NOT BINARY)
     if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
       set(X86_64 1)
@@ -109,7 +109,7 @@ else()
 endif ()

 if (NOT BINARY)
-  if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64)
+  if (X86_64 OR ARM64 OR MIPS64 OR LOONGARCH64 OR RISCV64 OR (POWER AND NOT (CMAKE_OSX_ARCHITECTURES STREQUAL "ppc")))
     set(BINARY 64)
   else ()
     set(BINARY 32)
diff --git a/common_loongarch64.h b/common_loongarch64.h
index ce1fcf091..b1426da79 100644
--- a/common_loongarch64.h
+++ b/common_loongarch64.h
@@ -119,12 +119,50 @@ static inline int WhereAmI(void){
 #define MOV   fmov.d
 #define CMOVT fsel
 #define MTC   movgr2fr.d
+#define MTG   movfr2gr.d
 #define FABS  fabs.d
+#define FMIN  fmin.d
+#define FMINA fmina.d
+#define FMAX  fmax.d
+#define FMAXA fmaxa.d
 #define CMPEQ fcmp.ceq.d
 #define CMPLE fcmp.cle.d
 #define CMPLT fcmp.clt.d
 #define NEG   fneg.d
+#define FFINT ffint.d.l
+
+#define XVFSUB  xvfsub.d
+#define XVFADD  xvfadd.d
+#define XVFMUL  xvfmul.d
+#define XVFMADD xvfmadd.d
+#define XVFMIN  xvfmin.d
+#define XVFMINA xvfmina.d
+#define XVFMAX  xvfmax.d
+#define XVFMAXA xvfmaxa.d
+#define XVCMPEQ xvfcmp.ceq.d
+#define XVCMPLE xvfcmp.cle.d
+#define XVCMPLT xvfcmp.clt.d
+#define XVMUL   xvfmul.d
+#define XVMSUB  xvfmsub.d
+#define XVNMSUB xvfnmsub.d
+
+#define VFSUB  vfsub.d
+#define VFADD  vfadd.d
+#define VFMUL  vfmul.d
+#define VFMADD vfmadd.d
+#define VFMIN  vfmin.d
+#define VFMINA vfmina.d
+#define VFMAX  vfmax.d
+#define VFMAXA vfmaxa.d
+#define VCMPEQ vfcmp.ceq.d
+#define VCMPLE vfcmp.cle.d
+#define VCMPLT vfcmp.clt.d
+#define VMUL   vfmul.d
+#define VMSUB  vfmsub.d
+#define VNMSUB vfnmsub.d
+
 #else
+
 #define LD fld.s
 #define ST fst.s
 #define MADD fmadd.s
@@ -137,11 +175,48 @@ static inline int WhereAmI(void){
 #define MOV   fmov.s
 #define CMOVT fsel
 #define MTC   movgr2fr.w
+#define MTG   movfr2gr.s
 #define FABS  fabs.s
+#define FMIN  fmin.s
+#define FMINA fmina.s
+#define FMAX  fmax.s
+#define FMAXA fmaxa.s
 #define CMPEQ fcmp.ceq.s
 #define CMPLE fcmp.cle.s
 #define CMPLT fcmp.clt.s
 #define NEG   fneg.s
+#define FFINT ffint.s.l
+
+#define XVFSUB  xvfsub.s
+#define XVFADD  xvfadd.s
+#define XVFMUL  xvfmul.s
+#define XVFMADD xvfmadd.s
+#define XVFMIN  xvfmin.s
+#define XVFMINA xvfmina.s
+#define XVFMAX  xvfmax.s
+#define XVFMAXA xvfmaxa.s
+#define XVCMPEQ xvfcmp.ceq.s
+#define XVCMPLE xvfcmp.cle.s
+#define XVCMPLT xvfcmp.clt.s
+#define XVMUL   xvfmul.s
+#define XVMSUB  xvfmsub.s
+#define XVNMSUB xvfnmsub.s
+
+#define VFSUB  vfsub.s
+#define VFADD  vfadd.s
+#define VFMUL  vfmul.s
+#define VFMADD vfmadd.s
+#define VFMIN  vfmin.s
+#define VFMINA vfmina.s
+#define VFMAX  vfmax.s
+#define VFMAXA vfmaxa.s
+#define VCMPEQ vfcmp.ceq.s
+#define VCMPLE vfcmp.cle.s
+#define VCMPLT vfcmp.clt.s
+#define VMUL   vfmul.s
+#define VMSUB  vfmsub.s
+#define VNMSUB vfnmsub.s
+
 #endif /* defined(DOUBLE) */

 #if defined(__64BIT__) && defined(USE64BITINT)
diff --git a/common_thread.h b/common_thread.h
index 6e18d2a8e..9e7dae74a 100644
--- a/common_thread.h
+++ b/common_thread.h
@@ -111,8 +111,9 @@ typedef struct blas_queue {
   struct blas_queue *next;

 #if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__)
-  CRITICAL_SECTION lock;
-  HANDLE finish;
+  // CRITICAL_SECTION lock;
+  // HANDLE finish;
+  volatile int finished;
 #else
   pthread_mutex_t lock;
   pthread_cond_t finished;
diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c
index 7c389db27..0ad32ae4e 100644
--- a/cpuid_loongarch64.c
+++ b/cpuid_loongarch64.c
@@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CPU_LOONGSON3R5    1
 #define CPU_LOONGSON2K1000 2

-#define LA_HWCAP_LSX  (1<<4)
-#define LA_HWCAP_LASX (1<<5)
+#define LA_HWCAP_LSX  (1U << 4)
+#define LA_HWCAP_LASX (1U << 5)

 static char *cpuname[] = {
   "LOONGSONGENERIC",
@@ -64,11 +64,11 @@ static char *cpuname_lower[] = {

 int detect(void) {
 #ifdef __linux
-  int flag = (int)getauxval(AT_HWCAP);
+  int hwcap = (int)getauxval(AT_HWCAP);

-  if (flag & LA_HWCAP_LASX)
+  if (hwcap & LA_HWCAP_LASX)
     return CPU_LOONGSON3R5;
-  else if (flag & LA_HWCAP_LSX)
+  else if (hwcap & LA_HWCAP_LSX)
     return CPU_LOONGSON2K1000;
   else
     return CPU_GENERIC;
@@ -94,7 +94,9 @@ void get_subdirname(void) {
 }

 void get_cpuconfig(void) {
+  uint32_t hwcaps = 0;
   int d = detect();
+
   switch (d) {
     case CPU_LOONGSON3R5:
       printf("#define LOONGSON3R5\n");
@@ -129,6 +131,10 @@ void get_cpuconfig(void) {
     printf("#define L2_ASSOCIATIVE 16\n");
     break;
   }
+
+  hwcaps = (uint32_t)getauxval( AT_HWCAP );
+  if (hwcaps & LA_HWCAP_LSX)  printf("#define HAVE_LSX\n");
+  if (hwcaps & LA_HWCAP_LASX) printf("#define HAVE_LASX\n");
 }

 void get_libname(void){
diff --git a/cpuid_power.c b/cpuid_power.c
index 2526e8d0e..1ced8930a 100644
--- a/cpuid_power.c
+++ b/cpuid_power.c
@@ -160,6 +160,7 @@ int detect(void){
   infoCount = HOST_BASIC_INFO_COUNT;
   host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, &infoCount);

+  if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7400) return CPUTYPE_PPCG4;
   if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7450) return CPUTYPE_PPCG4;
   if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_970)  return CPUTYPE_PPC970;
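The LoongArch cpuid code above (and the dynamic-arch dispatcher later in this patch) derives CPU capabilities from the kernel's hardware-capability vector rather than from privileged instructions. A minimal standalone sketch of the same detection logic, assuming a Linux/glibc environment (illustrative, not part of the patch):

```c
#include <stdio.h>
#include <stdint.h>
#include <sys/auxv.h>   /* getauxval, AT_HWCAP */

#define LA_HWCAP_LSX  (1U << 4)   /* 128-bit SIMD available */
#define LA_HWCAP_LASX (1U << 5)   /* 256-bit SIMD available */

int main(void) {
    uint32_t hwcap = (uint32_t)getauxval(AT_HWCAP);
    if (hwcap & LA_HWCAP_LASX)
        puts("LOONGSON3R5 path (LASX)");
    else if (hwcap & LA_HWCAP_LSX)
        puts("LOONGSON2K1000 path (LSX)");
    else
        puts("LOONGSONGENERIC path");
    return 0;
}
```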
diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 5bdfc1276..40ff85abc 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -51,15 +51,10 @@

 /* This is a thread implementation for Win32 lazy implementation */

 /* Thread server common information */
-typedef struct{
-  CRITICAL_SECTION lock;
-  HANDLE filled;
-  HANDLE killed;
-
-  blas_queue_t *queue;    /* Parameter Pointer */
-  int shutdown;           /* server shutdown flag */
-
-} blas_pool_t;
+static blas_queue_t *work_queue = NULL;
+static HANDLE kickoff_event = NULL;
+static CRITICAL_SECTION queue_lock;

 /* We need this global for checking if initialization is finished. */
 int blas_server_avail = 0;
@@ -67,11 +62,19 @@ int blas_server_avail = 0;
 /* Local Variables */
 static BLASULONG server_lock = 0;

-static blas_pool_t pool;
 static HANDLE blas_threads [MAX_CPU_NUMBER];
 static DWORD  blas_threads_id[MAX_CPU_NUMBER];
+static volatile int thread_target; // target num of live threads, volatile for cross-thread reads
-
+#if defined (__GNUC__) && (__GNUC__ < 6)
+  #define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch)
+#else
+  #if defined(_WIN64)
+    #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp)
+  #else
+    #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp)
+  #endif
+#endif

 static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
@@ -202,14 +205,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 static DWORD WINAPI blas_thread_server(void *arg){

   /* Thread identifier */
-#ifdef SMP_DEBUG
   BLASLONG cpu = (BLASLONG)arg;
-#endif

   void *buffer, *sa, *sb;
   blas_queue_t *queue;
-  DWORD action;
-  HANDLE handles[] = {pool.filled, pool.killed};

   /* Each server needs each buffer */
   buffer = blas_memory_alloc(2);
@@ -225,29 +224,44 @@ static DWORD WINAPI blas_thread_server(void *arg){
 #ifdef SMP_DEBUG
     fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
 #endif
+    // event raised when work is added to the queue
+    WaitForSingleObject(kickoff_event, INFINITE);

-    do {
-      action = WaitForMultipleObjects(2, handles, FALSE, INFINITE);
-    } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1));
-
-    if (action == WAIT_OBJECT_0 + 1) break;
+    if (cpu > thread_target - 2)
+    {
+      //printf("thread [%d] exiting.\n", cpu);
+      break;  // excess thread, so worker thread exits
+    }

 #ifdef SMP_DEBUG
     fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
 #endif

-    EnterCriticalSection(&pool.lock);
+#if 1
+    EnterCriticalSection(&queue_lock);
+
+    queue = work_queue;
+    if (queue)
+      work_queue = work_queue->next;
+
+    LeaveCriticalSection(&queue_lock);
+#else
+    volatile blas_queue_t* queue_next;

-    queue = pool.queue;
-    if (queue) pool.queue = queue->next;
+    INT_PTR prev_value;
+    do {
+      queue = (volatile blas_queue_t*)work_queue;
+      if (!queue)
+        break;

-    LeaveCriticalSection(&pool.lock);
+      queue_next = (volatile blas_queue_t*)queue->next;
+      prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue);
+    } while (prev_value != queue);
+#endif

     if (queue) {
       int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;

-      if (pool.queue) SetEvent(pool.filled);
-
       sa = queue -> sa;
       sb = queue -> sb;
@@ -331,14 +345,9 @@ static DWORD WINAPI blas_thread_server(void *arg){
 #ifdef SMP_DEBUG
     fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
 #endif
+
+    queue->finished = 1;

-    EnterCriticalSection(&queue->lock);
-
-    queue -> status = BLAS_STATUS_FINISHED;
-
-    LeaveCriticalSection(&queue->lock);
-
-    SetEvent(queue->finish);
   }

   /* Shutdown procedure */
@@ -366,15 +375,16 @@ int blas_thread_init(void){
 #endif

   if (!blas_server_avail){
+    // create the kickoff Event
+    kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);

-    InitializeCriticalSection(&pool.lock);
-    pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
-    pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL);
+    thread_target = blas_cpu_number;

-    pool.shutdown = 0;
-    pool.queue    = NULL;
+    InitializeCriticalSection(&queue_lock);

     for(i = 0; i < blas_cpu_number - 1; i++){
+      //printf("thread_init: creating thread [%d]\n", i);
+
       blas_threads[i] = CreateThread(NULL, 0,
                                      blas_thread_server, (void *)i,
                                      0, &blas_threads_id[i]);
@@ -409,8 +419,6 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
   current = queue;

   while (current) {
-    InitializeCriticalSection(&current -> lock);
-    current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL);
     current -> position = pos;

 #ifdef CONSISTENT_FPCSR
@@ -418,23 +426,32 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
     __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode));
 #endif

+    current->finished = 0;
     current = current -> next;
     pos ++;
   }

-  EnterCriticalSection(&pool.lock);
+  EnterCriticalSection(&queue_lock);
+
+  if (!work_queue)
+  {
+    work_queue = queue;
+  }
+  else
+  {
+    blas_queue_t *next_item = work_queue;
+
+    // find the end of the work queue
+    while (next_item->next)
+      next_item = next_item->next;

-  if (pool.queue) {
-    current = pool.queue;
-    while (current -> next) current = current -> next;
-    current -> next = queue;
-  } else {
-    pool.queue = queue;
+    // add new work to the end
+    next_item->next = queue;
   }

-  LeaveCriticalSection(&pool.lock);
+  LeaveCriticalSection(&queue_lock);

-  SetEvent(pool.filled);
+  SetEvent(kickoff_event);

   return 0;
 }
@@ -449,21 +466,26 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
 #ifdef SMP_DEBUG
     fprintf(STDERR, "Waiting Queue ..\n");
 #endif
+    while (!queue->finished)
+      YIELDING;

-    WaitForSingleObject(queue->finish, INFINITE);
-
-    CloseHandle(queue->finish);
-    DeleteCriticalSection(&queue -> lock);
-
-    queue = queue -> next;
-    num --;
+    queue = queue->next;
+    num--;
   }

 #ifdef SMP_DEBUG
   fprintf(STDERR, "Completely Done.\n\n");
 #endif
+  // if work was added to the queue after this batch we can't sleep the worker threads
+  // by resetting the event
+  EnterCriticalSection(&queue_lock);

-  return 0;
+  if (work_queue == NULL)
+    ResetEvent(kickoff_event);
+
+  LeaveCriticalSection(&queue_lock);
+
+  return 0;
 }

 /* Execute Threads */
@@ -512,8 +534,6 @@ int BLASFUNC(blas_thread_shutdown)(void){

   if (blas_server_avail){

-    SetEvent(pool.killed);
-
     for(i = 0; i < blas_num_threads - 1; i++){
       // Could also just use WaitForMultipleObjects
       DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
@@ -528,9 +548,6 @@ int BLASFUNC(blas_thread_shutdown)(void){
       CloseHandle(blas_threads[i]);
     }

-    CloseHandle(pool.filled);
-    CloseHandle(pool.killed);
-
     blas_server_avail = 0;
   }
@@ -552,23 +569,48 @@ void goto_set_num_threads(int num_threads)

   if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;

+  if (blas_server_avail && num_threads < blas_num_threads) {
+    LOCK_COMMAND(&server_lock);
+
+    thread_target = num_threads;
+
+    SetEvent(kickoff_event);
+
+    for (i = num_threads - 1; i < blas_num_threads - 1; i++) {
+      //printf("set_num_threads: waiting on thread [%d] to quit.\n", i);
+
+      WaitForSingleObject(blas_threads[i], INFINITE);
+
+      //printf("set_num_threads: thread [%d] has quit.\n", i);
+
+      CloseHandle(blas_threads[i]);
+    }
+
+    blas_num_threads = num_threads;
+
+    ResetEvent(kickoff_event);
+
+    UNLOCK_COMMAND(&server_lock);
+  }
+
   if (num_threads > blas_num_threads) {

     LOCK_COMMAND(&server_lock);

+    thread_target = num_threads;
+
     //increased_threads = 1;

     if (!blas_server_avail){
+      // create the kickoff Event
+      kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);

-      InitializeCriticalSection(&pool.lock);
-      pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
-      pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL);
+      InitializeCriticalSection(&queue_lock);

-      pool.shutdown = 0;
-      pool.queue    = NULL;
-
       blas_server_avail = 1;
     }

     for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
+      //printf("set_num_threads: creating thread [%d]\n", i);

       blas_threads[i] = CreateThread(NULL, 0,
                                      blas_thread_server, (void *)i,
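The rework above replaces one kernel event and critical section per work item with a single manual-reset kickoff event plus a volatile per-item completion flag. A stripped-down sketch of that handshake (illustrative only; names follow the patch, error handling omitted, LIFO push used for brevity where the patch appends FIFO):

```c
#include <windows.h>

typedef struct item { struct item *next; volatile int finished; } item_t;

static item_t *work_queue;           /* guarded by queue_lock */
static HANDLE kickoff_event;         /* manual-reset: stays signaled while work may exist */
static CRITICAL_SECTION queue_lock;

static void init(void) {
    InitializeCriticalSection(&queue_lock);
    /* manual-reset (TRUE), initially unsignaled (FALSE) */
    kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
}

/* producer side, as in exec_blas_async() */
static void submit(item_t *it) {
    it->finished = 0;
    EnterCriticalSection(&queue_lock);
    it->next = work_queue;
    work_queue = it;
    LeaveCriticalSection(&queue_lock);
    SetEvent(kickoff_event);         /* wake every sleeping worker at once */
}

/* consumer side, as in blas_thread_server() */
static item_t *take(void) {
    item_t *it;
    WaitForSingleObject(kickoff_event, INFINITE);
    EnterCriticalSection(&queue_lock);
    it = work_queue;
    if (it) work_queue = it->next;
    LeaveCriticalSection(&queue_lock);
    return it;                       /* caller runs it, then sets it->finished = 1 */
}
```

The waiter then spins (with a yield) on `finished` instead of blocking on a per-item event, and the event is only reset once the queue is observed empty, which is exactly what `exec_blas_async_wait()` above does. The disabled `#else` branch in the patch shows an alternative lock-free pop built on the `WIN_CAS` compare-and-swap macro.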
diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index 530d18115..803e0b5eb 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -122,10 +122,11 @@ extern gotoblas_t gotoblas_CORTEXA55;
 #endif
 #else
 extern gotoblas_t gotoblas_CORTEXA53;
+#define gotoblas_CORTEXA55 gotoblas_CORTEXA53
 extern gotoblas_t gotoblas_CORTEXA57;
-extern gotoblas_t gotoblas_CORTEXA72;
-extern gotoblas_t gotoblas_CORTEXA73;
-extern gotoblas_t gotoblas_FALKOR;
+#define gotoblas_CORTEXA72 gotoblas_CORTEXA57
+#define gotoblas_CORTEXA73 gotoblas_CORTEXA57
+#define gotoblas_FALKOR gotoblas_CORTEXA57
 extern gotoblas_t gotoblas_THUNDERX;
 extern gotoblas_t gotoblas_THUNDERX2T99;
 extern gotoblas_t gotoblas_TSV110;
@@ -141,7 +142,6 @@ extern gotoblas_t gotoblas_ARMV8SVE;
 #define gotoblas_ARMV8SVE gotoblas_ARMV8
 #endif
 extern gotoblas_t gotoblas_THUNDERX3T110;
-extern gotoblas_t gotoblas_CORTEXA55;
 #endif

 extern void openblas_warning(int verbose, const char * msg);
@@ -247,6 +247,10 @@ static gotoblas_t *get_coretype(void) {
   int implementer, variant, part, arch, revision, midr_el1;
   char coremsg[128];

+#if defined (OS_DARWIN)
+  return &gotoblas_NEOVERSEN1;
+#endif
+
 #if (!defined OS_LINUX && !defined OS_ANDROID)
   return NULL;
 #else
@@ -352,6 +356,9 @@ static gotoblas_t *get_coretype(void) {
         return &gotoblas_FALKOR;
       }
       break;
+    case 0x61: // Apple
+      return &gotoblas_NEOVERSEN1;
+      break;
     default:
       snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
       openblas_warning(1, coremsg);
diff --git a/driver/others/dynamic_loongarch64.c b/driver/others/dynamic_loongarch64.c
index 52f8bcb2f..44de59669 100644
--- a/driver/others/dynamic_loongarch64.c
+++ b/driver/others/dynamic_loongarch64.c
@@ -25,6 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *******************************************************************************/

+#include <sys/auxv.h>
 #include "common.h"

 extern gotoblas_t gotoblas_LOONGSON3R5;
@@ -74,21 +75,15 @@ static gotoblas_t *force_coretype(char *coretype) {
   return NULL;
 }

-#define LASX_MASK      1<<7
-#define LSX_MASK       1<<6
-#define LOONGARCH_CFG2 0x02
+#define LA_HWCAP_LSX  (1U << 4)
+#define LA_HWCAP_LASX (1U << 5)

 static gotoblas_t *get_coretype(void) {
-  int ret = 0;
-
-  __asm__ volatile (
-    "cpucfg %0, %1 \n\t"
-    : "+&r"(ret)
-    : "r"(LOONGARCH_CFG2)
-  );
-
-  if (ret & LASX_MASK)
+  int hwcap = (int)getauxval(AT_HWCAP);
+
+  if (hwcap & LA_HWCAP_LASX)
     return &gotoblas_LOONGSON3R5;
-  else if (ret & LSX_MASK)
+  else if (hwcap & LA_HWCAP_LSX)
     return &gotoblas_LOONGSON2K1000;
   else
     return &gotoblas_LOONGSONGENERIC;
diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index f0faf2baf..0454f186c 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -66,8 +66,7 @@ static int cpuid(void)
 #endif
 	return CPU_UNKNOWN;
 }
-#else
-#if defined(C_PGI) || defined(__clang__)
+#elif defined(C_PGI) || defined(__clang__)
 /*
  * NV HPC compilers do not yet implement __builtin_cpu_is().
  * Fake a version here for use in the CPU detection code below.
@@ -196,13 +195,21 @@ static int cpuid(void)
 			cpu_type = pvrPOWER[i].cpu_type;
 	}
 	return (int)(cpu_type);
 }
-#endif /* C_PGI */
+#elif !defined(__BUILTIN_CPU_SUPPORTS__)
+static int cpuid(void)
+{
+	return CPU_UNKNOWN;
+}
 #endif /* _AIX */

 #ifndef __BUILTIN_CPU_SUPPORTS__
 #include <string.h>

-#if defined(_AIX) || (defined(__has_builtin) && !__has_builtin(__builtin_cpu_is))
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+#if defined(_AIX) || !__has_builtin(__builtin_cpu_is)
 static int __builtin_cpu_is(const char *arg)
 {
   static int ipinfo = -1;
@@ -227,7 +234,7 @@ static int __builtin_cpu_is(const char *arg)
 }
 #endif

-#if defined(_AIX) || (defined(__has_builtin) && !__has_builtin(__builtin_cpu_supports))
+#if defined(_AIX) || !__has_builtin(__builtin_cpu_supports)
 static int __builtin_cpu_supports(const char *arg)
 {
   return 0;
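The `#ifndef __has_builtin` guard added above is the usual portability idiom: on compilers that predate `__has_builtin`, the macro is defined to always answer 0, so the later `#if` tests degrade gracefully instead of failing to preprocess. A self-contained illustration of the same pattern (using `__builtin_expect` instead of the POWER-specific builtins so it compiles anywhere; that substitution is this example's assumption, not taken from the patch):

```c
#include <stdio.h>

/* Compilers without the __has_builtin operator see a stand-in that
   always reports "not available", keeping the #if below parseable. */
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif

#if __has_builtin(__builtin_expect)
#define LIKELY(x) __builtin_expect(!!(x), 1)
#else
#define LIKELY(x) (x)   /* graceful fallback, no branch hint */
#endif

int main(void) {
    int n = 3;
    if (LIKELY(n > 0))
        puts("fast path");
    return 0;
}
```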
diff --git a/interface/lapack/gesv.c b/interface/lapack/gesv.c
index 175350329..546c2bed2 100644
--- a/interface/lapack/gesv.c
+++ b/interface/lapack/gesv.c
@@ -114,7 +114,14 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv,

 #ifdef SMP
   args.common = NULL;
-  args.nthreads = num_cpu_avail(4);
+#ifndef DOUBLE
+  if (args.m*args.n < 40000)
+#else
+  if (args.m*args.n < 10000)
+#endif
+    args.nthreads=1;
+  else
+    args.nthreads = num_cpu_avail(4);

   if (args.nthreads == 1) {
 #endif
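The change above keeps small ?GESV problems on a single thread, where thread start-up overhead would otherwise dominate the factorization. Schematically, the decision reduces to the following helper (a hedged restatement of the logic above; `gesv_threads` is a made-up name for illustration):

```c
/* Single- vs. multi-threaded decision for ?GESV as introduced above:
   single precision gets a higher cutoff (40000) than double (10000)
   because each element is cheaper to process. */
static int gesv_threads(int m, int n, int is_double, int avail) {
    long area   = (long)m * n;           /* matrix "size" proxy m*n */
    long cutoff = is_double ? 10000 : 40000;
    return (area < cutoff) ? 1 : avail;  /* avail = num_cpu_avail(4) */
}
```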
diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX
index ccbce27e1..bc5999097 100644
--- a/kernel/arm64/KERNEL.A64FX
+++ b/kernel/arm64/KERNEL.A64FX
@@ -1,206 +1 @@
-SAMINKERNEL = ../arm/amin.c
-DAMINKERNEL = ../arm/amin.c
-CAMINKERNEL = ../arm/zamin.c
-ZAMINKERNEL = ../arm/zamin.c
-
-SMAXKERNEL = ../arm/max.c
-DMAXKERNEL = ../arm/max.c
-
-SMINKERNEL = ../arm/min.c
-DMINKERNEL = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL = ../arm/imax.c
-IDMAXKERNEL = ../arm/imax.c
-
-ISMINKERNEL = ../arm/imin.c
-IDMINKERNEL = ../arm/imin.c
-
-STRSMKERNEL_LN = trsm_kernel_LN_sve.c
-STRSMKERNEL_LT = trsm_kernel_LT_sve.c
-STRSMKERNEL_RN = trsm_kernel_RN_sve.c
-STRSMKERNEL_RT = trsm_kernel_RT_sve.c
-
-DTRSMKERNEL_LN = trsm_kernel_LN_sve.c
-DTRSMKERNEL_LT = trsm_kernel_LT_sve.c
-DTRSMKERNEL_RN = trsm_kernel_RN_sve.c
-DTRSMKERNEL_RT = trsm_kernel_RT_sve.c
-
-TRSMCOPYLN_M = trsm_lncopy_sve.c
-TRSMCOPYLT_M = trsm_ltcopy_sve.c
-TRSMCOPYUN_M = trsm_uncopy_sve.c
-TRSMCOPYUT_M = trsm_utcopy_sve.c
-
-CTRSMKERNEL_LN = trsm_kernel_LN_sve.c
-CTRSMKERNEL_LT = trsm_kernel_LT_sve.c
-CTRSMKERNEL_RN = trsm_kernel_RN_sve.c
-CTRSMKERNEL_RT = trsm_kernel_RT_sve.c
-
-ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c
-ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c
-ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c
-ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c
-
-ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c
-ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c
-ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c
-ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c
-
-
-SAMAXKERNEL = amax.S
-DAMAXKERNEL = amax.S
-CAMAXKERNEL = zamax.S
-ZAMAXKERNEL = zamax.S
-
-SAXPYKERNEL = axpy.S
-DAXPYKERNEL = daxpy_thunderx2t99.S
-CAXPYKERNEL = zaxpy.S
-ZAXPYKERNEL = zaxpy.S
-
-SROTKERNEL = rot.S
-DROTKERNEL = rot.S
-CROTKERNEL = zrot.S
-ZROTKERNEL = zrot.S
-
-SSCALKERNEL = scal.S
-DSCALKERNEL = scal.S
-CSCALKERNEL = zscal.S
-ZSCALKERNEL = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-SASUMKERNEL = sasum_thunderx2t99.c
-DASUMKERNEL = dasum_thunderx2t99.c
-CASUMKERNEL = casum_thunderx2t99.c
-ZASUMKERNEL = zasum_thunderx2t99.c
-
-SCOPYKERNEL = copy_thunderx2t99.c
-DCOPYKERNEL = copy_thunderx2t99.c
-CCOPYKERNEL = copy_thunderx2t99.c
-ZCOPYKERNEL = copy_thunderx2t99.c
-
-SSWAPKERNEL = swap_thunderx2t99.S
-DSWAPKERNEL = swap_thunderx2t99.S
-CSWAPKERNEL = swap_thunderx2t99.S
-ZSWAPKERNEL = swap_thunderx2t99.S
-
-ISAMAXKERNEL = iamax_thunderx2t99.c
-IDAMAXKERNEL = iamax_thunderx2t99.c
-ICAMAXKERNEL = izamax_thunderx2t99.c
-IZAMAXKERNEL = izamax_thunderx2t99.c
-
-SNRM2KERNEL = scnrm2_thunderx2t99.c
-DNRM2KERNEL = dznrm2_thunderx2t99.c
-CNRM2KERNEL = scnrm2_thunderx2t99.c
-ZNRM2KERNEL = dznrm2_thunderx2t99.c
-
-DDOTKERNEL = dot.c
-SDOTKERNEL = dot.c
-CDOTKERNEL = zdot_thunderx2t99.c
-ZDOTKERNEL = zdot_thunderx2t99.c
-DSDOTKERNEL = dot.S
-
-DGEMM_BETA = dgemm_beta.S
-SGEMM_BETA = sgemm_beta.S
-
-SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
-STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
-
-SGEMMINCOPY = gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c
-SGEMMITCOPY = gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c
-SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
-SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
-
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
-STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
-STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
-STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
-
-SSYMMUCOPY_M = symm_ucopy_sve.c
-SSYMMLCOPY_M = symm_lcopy_sve.c
-
-DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
-DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPY = gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c
-DGEMMITCOPY = gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c
-DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
-DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
-DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
-DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
-
-DSYMMUCOPY_M = symm_ucopy_sve.c
-DSYMMLCOPY_M = symm_lcopy_sve.c
-
-CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
-CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
-CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
-CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
-
-CHEMMLTCOPY_M = zhemm_ltcopy_sve.c
-CHEMMUTCOPY_M = zhemm_utcopy_sve.c
-
-CSYMMUCOPY_M = zsymm_ucopy_sve.c
-CSYMMLCOPY_M = zsymm_lcopy_sve.c
-
-ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
-ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
-ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
-ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
-
-ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c
-ZHEMMUTCOPY_M = zhemm_utcopy_sve.c
-
-ZSYMMUCOPY_M = zsymm_ucopy_sve.c
-ZSYMMLCOPY_M = zsymm_lcopy_sve.c
+include $(KERNELDIR)/KERNEL.ARMV8SVE
diff --git a/kernel/arm64/KERNEL.CORTEXA55 b/kernel/arm64/KERNEL.CORTEXA55
index e2e006770..574e98b8c 100644
--- a/kernel/arm64/KERNEL.CORTEXA55
+++ b/kernel/arm64/KERNEL.CORTEXA55
@@ -1,196 +1 @@
-SAMINKERNEL = ../arm/amin.c
-DAMINKERNEL = ../arm/amin.c
-CAMINKERNEL = ../arm/zamin.c
-ZAMINKERNEL = ../arm/zamin.c
-
-SMAXKERNEL = ../arm/max.c
-DMAXKERNEL = ../arm/max.c
-
-SMINKERNEL = ../arm/min.c
-DMINKERNEL = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL = ../arm/imax.c
-IDMAXKERNEL = ../arm/imax.c
-
-ISMINKERNEL = ../arm/imin.c
-IDMINKERNEL = ../arm/imin.c
-
-STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-SAMAXKERNEL = amax.S
-DAMAXKERNEL = amax.S
-CAMAXKERNEL = zamax.S
-ZAMAXKERNEL = zamax.S
-
-SAXPYKERNEL = axpy.S
-DAXPYKERNEL = axpy.S
-CAXPYKERNEL = zaxpy.S
-ZAXPYKERNEL = zaxpy.S
-
-SROTKERNEL = rot.S
-DROTKERNEL = rot.S
-CROTKERNEL = zrot.S
-ZROTKERNEL = zrot.S
-
-SSCALKERNEL = scal.S
-DSCALKERNEL = scal.S
-CSCALKERNEL = zscal.S
-ZSCALKERNEL = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-
-SASUMKERNEL = asum.S
-DASUMKERNEL = asum.S
-CASUMKERNEL = casum.S
-ZASUMKERNEL = zasum.S
-
-SCOPYKERNEL = copy.S
-DCOPYKERNEL = copy.S
-CCOPYKERNEL = copy.S
-ZCOPYKERNEL = copy.S
-
-SSWAPKERNEL = swap.S
-DSWAPKERNEL = swap.S
-CSWAPKERNEL = swap.S
-ZSWAPKERNEL = swap.S
-
-ISAMAXKERNEL = iamax.S
-IDAMAXKERNEL = iamax.S
-ICAMAXKERNEL = izamax.S
-IZAMAXKERNEL = izamax.S
-
-SNRM2KERNEL = nrm2.S
-DNRM2KERNEL = nrm2.S
-CNRM2KERNEL = znrm2.S
-ZNRM2KERNEL = znrm2.S
-
-ifneq ($(C_COMPILER), PGI)
-SDOTKERNEL = ../generic/dot.c
-else
-SDOTKERNEL = dot.S
-endif
-DDOTKERNEL = dot.S
-ifneq ($(C_COMPILER), PGI)
-CDOTKERNEL = zdot.S
-ZDOTKERNEL = zdot.S
-else
-CDOTKERNEL = ../arm/zdot.c
-ZDOTKERNEL = ../arm/zdot.c
-endif
-DSDOTKERNEL = dot.S
-
-DGEMM_BETA = dgemm_beta.S
-SGEMM_BETA = sgemm_beta.S
-
-ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
-SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
-STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
-else
-SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-endif
-ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
-ifeq ($(SGEMM_UNROLL_M), 16)
-SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
-endif
-ifeq ($(SGEMM_UNROLL_M), 4)
-SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
-endif
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
-SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
-SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c
-DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
-
-ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
-
-ifeq ($(DGEMM_UNROLL_M), 8)
-DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
-DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
-else
-DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
-DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
-endif
-
-DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-ifeq ($(DGEMM_UNROLL_N), 4)
-DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
-else
-DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
-DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
-endif
-
-DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c
-CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
-ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
-CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
-CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
-CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
-CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
-CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c
-ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
-ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
-ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
-ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
-ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+include $(KERNELDIR)/KERNEL.CORTEXA53
-IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = daxpy_thunderx2t99.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) -SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c -SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c -SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S - -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) - -ifeq ($(DGEMM_UNROLL_M), 8) -DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S -DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S -else -DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c -DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c -endif - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S -else -DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c -endif - -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S 
-ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -SASUMKERNEL = sasum_thunderx2t99.c -DASUMKERNEL = dasum_thunderx2t99.c -CASUMKERNEL = casum_thunderx2t99.c -ZASUMKERNEL = zasum_thunderx2t99.c - -SCOPYKERNEL = copy_thunderx2t99.c -DCOPYKERNEL = copy_thunderx2t99.c -CCOPYKERNEL = copy_thunderx2t99.c -ZCOPYKERNEL = copy_thunderx2t99.c - -SSWAPKERNEL = swap_thunderx2t99.S -DSWAPKERNEL = swap_thunderx2t99.S -CSWAPKERNEL = swap_thunderx2t99.S -ZSWAPKERNEL = swap_thunderx2t99.S - -ISAMAXKERNEL = iamax_thunderx2t99.c -IDAMAXKERNEL = iamax_thunderx2t99.c -ICAMAXKERNEL = izamax_thunderx2t99.c -IZAMAXKERNEL = izamax_thunderx2t99.c - -SNRM2KERNEL = scnrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c -#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c - - -DDOTKERNEL = dot.c -SDOTKERNEL = dot.c -CDOTKERNEL = zdot_thunderx2t99.c -ZDOTKERNEL = zdot_thunderx2t99.c -DSDOTKERNEL = dot.S - -ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) -DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S -endif - -ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) -SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S -endif - -ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) -CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S -endif - -ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) -ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S -endif +include $(KERNELDIR)/KERNEL.THUNDERX2T99 diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c index 9c057551e..16f4cd537 100644 --- a/kernel/arm64/dot_kernel_sve.c +++ b/kernel/arm64/dot_kernel_sve.c @@ -1,4 +1,5 @@ /*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project Copyright (c) 2022, Arm Ltd All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,37 +31,84 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
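+/* Structure of the inline-assembly kernel below: DOT_KERNEL expands to a
+   two-vector main loop ("vector_2x", accumulating into z1 and z0 from two
+   pointers one vector length apart), a single-vector loop ("vector_1x")
+   for any remaining full vectors, and a whilelo-predicated tail, with
+   faddv horizontal sums of both accumulators combined into the result. */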
#include #ifdef DOUBLE -#define SVE_TYPE svfloat64_t -#define SVE_ZERO svdup_f64(0.0) -#define SVE_WHILELT svwhilelt_b64 -#define SVE_ALL svptrue_b64() -#define SVE_WIDTH svcntd() +#define DTYPE "d" +#define WIDTH "d" +#define SHIFT "3" #else -#define SVE_TYPE svfloat32_t -#define SVE_ZERO svdup_f32(0.0) -#define SVE_WHILELT svwhilelt_b32 -#define SVE_ALL svptrue_b32() -#define SVE_WIDTH svcntw() +#define DTYPE "s" +#define WIDTH "w" +#define SHIFT "2" #endif -static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) { - SVE_TYPE acc_a = SVE_ZERO; - SVE_TYPE acc_b = SVE_ZERO; +#define COUNT \ +" cnt"WIDTH" x9 \n" +#define SETUP_TRUE \ +" ptrue p0."DTYPE" \n" +#define OFFSET_INPUTS \ +" add x12, %[X_], x9, lsl #"SHIFT" \n" \ +" add x13, %[Y_], x9, lsl #"SHIFT" \n" +#define TAIL_WHILE \ +" whilelo p1."DTYPE", x8, x0 \n" +#define UPDATE(pg, x,y,out) \ +" ld1"WIDTH" { z2."DTYPE" }, "pg"/z, ["x", x8, lsl #"SHIFT"] \n" \ +" ld1"WIDTH" { z3."DTYPE" }, "pg"/z, ["y", x8, lsl #"SHIFT"] \n" \ +" fmla "out"."DTYPE", "pg"/m, z2."DTYPE", z3."DTYPE" \n" +#define SUM_VECTOR(v) \ +" faddv "DTYPE""v", p0, z"v"."DTYPE" \n" +#define RET \ +" fadd %"DTYPE"[RET_], "DTYPE"1, "DTYPE"0 \n" - BLASLONG sve_width = SVE_WIDTH; +#define DOT_KERNEL \ + COUNT \ +" mov z1.d, #0 \n" \ +" mov z0.d, #0 \n" \ +" mov x8, #0 \n" \ +" movi d1, #0x0 \n" \ + SETUP_TRUE \ +" neg x10, x9, lsl #1 \n" \ +" ands x11, x10, x0 \n" \ +" b.eq 2f // skip_2x \n" \ + OFFSET_INPUTS \ +"1: // vector_2x \n" \ + UPDATE("p0", "%[X_]", "%[Y_]", "z1") \ + UPDATE("p0", "x12", "x13", "z0") \ +" sub x8, x8, x10 \n" \ +" cmp x8, x11 \n" \ +" b.lo 1b // vector_2x \n" \ + SUM_VECTOR("1") \ +"2: // skip_2x \n" \ +" neg x10, x9 \n" \ +" and x10, x10, x0 \n" \ +" cmp x8, x10 \n" \ +" b.hs 4f // tail \n" \ +"3: // vector_1x \n" \ + UPDATE("p0", "%[X_]", "%[Y_]", "z0") \ +" add x8, x8, x9 \n" \ +" cmp x8, x10 \n" \ +" b.lo 3b // vector_1x \n" \ +"4: // tail \n" \ +" cmp x10, x0 \n" \ +" b.eq 5f // end \n" \ + TAIL_WHILE \ + UPDATE("p1", "%[X_]", "%[Y_]", "z0") \ +"5: // end \n" \ + SUM_VECTOR("0") \ + RET - for (BLASLONG i = 0; i < n; i += sve_width * 2) { - svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n); - svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n); +static +FLOAT +dot_kernel_sve(BLASLONG n, FLOAT* x, FLOAT* y) +{ + FLOAT ret; - SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); - SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); - SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]); - SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]); + asm(DOT_KERNEL + : + [RET_] "=&w" (ret) + : + [N_] "r" (n), + [X_] "r" (x), + [Y_] "r" (y) + :); - acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a); - acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b); - } - - return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b); + return ret; } diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 new file mode 100644 index 000000000..c365e9a75 --- /dev/null +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -0,0 +1,110 @@ +ifndef NO_LSX + +SDOTKERNEL = dot_lsx.S +DSDOTKERNEL = dot_lsx.S +DDOTKERNEL = dot_lsx.S +CDOTKERNEL = cdot_lsx.S +ZDOTKERNEL = cdot_lsx.S + +SSCALKERNEL = scal_lsx.S +DSCALKERNEL = scal_lsx.S +CSCALKERNEL = cscal_lsx.S +ZSCALKERNEL = cscal_lsx.S + +SAMAXKERNEL = amax_lsx.S +DAMAXKERNEL = amax_lsx.S +CAMAXKERNEL = camax_lsx.S + +SAMINKERNEL = amin_lsx.S +DAMINKERNEL = amin_lsx.S +CAMINKERNEL = camin_lsx.S + +SMAXKERNEL = max_lsx.S +DMAXKERNEL = max_lsx.S + +SMINKERNEL = min_lsx.S +DMINKERNEL = min_lsx.S + +ISMAXKERNEL = 
imax_lsx.S +IDMAXKERNEL = imax_lsx.S + +ISMINKERNEL = imin_lsx.S +IDMINKERNEL = imin_lsx.S + +ISAMAXKERNEL = iamax_lsx.S +IDAMAXKERNEL = iamax_lsx.S +ICAMAXKERNEL = icamax_lsx.S +IZAMAXKERNEL = icamax_lsx.S + +ISAMINKERNEL = iamin_lsx.S +IDAMINKERNEL = iamin_lsx.S +ICAMINKERNEL = icamin_lsx.S +IZAMINKERNEL = icamin_lsx.S + +SCOPYKERNEL = copy_lsx.S +DCOPYKERNEL = copy_lsx.S +CCOPYKERNEL = ccopy_lsx.S +ZCOPYKERNEL = ccopy_lsx.S + +SSWAPKERNEL = swap_lsx.S +DSWAPKERNEL = swap_lsx.S + +SAXPYKERNEL = axpy_lsx.S +DAXPYKERNEL = axpy_lsx.S +CAXPYKERNEL = caxpy_lsx.S +ZAXPYKERNEL = caxpy_lsx.S + +SAXPBYKERNEL = axpby_lsx.S +DAXPBYKERNEL = axpby_lsx.S + +SSUMKERNEL = sum_lsx.S +DSUMKERNEL = sum_lsx.S + +SASUMKERNEL = asum_lsx.S +DASUMKERNEL = asum_lsx.S +CASUMKERNEL = casum_lsx.S +ZASUMKERNEL = casum_lsx.S + +SROTKERNEL = rot_lsx.S +DROTKERNEL = rot_lsx.S +CROTKERNEL = crot_lsx.S +ZROTKERNEL = crot_lsx.S + +SNRM2KERNEL = snrm2_lsx.S +DNRM2KERNEL = dnrm2_lsx.S +CNRM2KERNEL = cnrm2_lsx.S +ZNRM2KERNEL = znrm2_lsx.S + +CSWAPKERNEL = cswap_lsx.S +ZSWAPKERNEL = cswap_lsx.S + +CSUMKERNEL = csum_lsx.S +ZSUMKERNEL = csum_lsx.S + +DGEMMKERNEL = dgemm_kernel_8x4.S +DGEMMINCOPY = dgemm_ncopy_8_lsx.S +DGEMMITCOPY = dgemm_tcopy_8_lsx.S +DGEMMONCOPY = dgemm_ncopy_4_lsx.S +DGEMMOTCOPY = dgemm_tcopy_4_lsx.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CGEMMKERNEL = cgemm_kernel_2x2_lsx.S +CGEMMONCOPY = cgemm_ncopy_2_lsx.S +CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 011e8b89e..68360faaf 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -1,4 +1,86 @@ ifndef NO_LASX + +SDOTKERNEL = dot_lasx.S +DSDOTKERNEL = dot_lasx.S +DDOTKERNEL = dot_lasx.S +CDOTKERNEL = cdot_lasx.S +ZDOTKERNEL = cdot_lasx.S + +SSCALKERNEL = scal_lasx.S +DSCALKERNEL = scal_lasx.S +CSCALKERNEL = cscal_lasx.S +ZSCALKERNEL = cscal_lasx.S + +SAMAXKERNEL = amax_lasx.S +DAMAXKERNEL = amax_lasx.S +CAMAXKERNEL = camax_lasx.S + +SAMINKERNEL = amin_lasx.S +DAMINKERNEL = amin_lasx.S +CAMINKERNEL = camin_lasx.S + +SMAXKERNEL = max_lsx.S +DMAXKERNEL = max_lsx.S + +SMINKERNEL = min_lsx.S +DMINKERNEL = min_lsx.S + +ISMAXKERNEL = imax_lasx.S +IDMAXKERNEL = imax_lasx.S + +ISMINKERNEL = imin_lasx.S +IDMINKERNEL = imin_lasx.S + +ISAMAXKERNEL = iamax_lasx.S +IDAMAXKERNEL = iamax_lasx.S +ICAMAXKERNEL = icamax_lasx.S +IZAMAXKERNEL = icamax_lasx.S + +ISAMINKERNEL = iamin_lasx.S +IDAMINKERNEL = iamin_lasx.S +ICAMINKERNEL = icamin_lasx.S +IZAMINKERNEL = icamin_lasx.S + +SCOPYKERNEL = copy_lasx.S +DCOPYKERNEL = copy_lasx.S +CCOPYKERNEL = ccopy_lasx.S +ZCOPYKERNEL = ccopy_lasx.S + +SSWAPKERNEL = swap_lasx.S +DSWAPKERNEL = swap_lasx.S + +SAXPYKERNEL = axpy_lasx.S +DAXPYKERNEL = axpy_lasx.S +CAXPYKERNEL = caxpy_lasx.S +ZAXPYKERNEL = caxpy_lasx.S + +SAXPBYKERNEL = axpby_lasx.S +DAXPBYKERNEL = axpby_lasx.S + 
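+# Note: kernels named *_lasx.S use the 256-bit LASX vector extension; the
+# max/min kernels above reuse the 128-bit LSX implementations (max_lsx.S,
+# min_lsx.S).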
+SSUMKERNEL = sum_lasx.S +DSUMKERNEL = sum_lasx.S + +SASUMKERNEL = asum_lasx.S +DASUMKERNEL = asum_lasx.S +CASUMKERNEL = casum_lasx.S +ZASUMKERNEL = casum_lasx.S + +SROTKERNEL = rot_lasx.S +DROTKERNEL = rot_lasx.S +CROTKERNEL = crot_lasx.S +ZROTKERNEL = crot_lasx.S + +SNRM2KERNEL = snrm2_lasx.S +DNRM2KERNEL = dnrm2_lasx.S +CNRM2KERNEL = cnrm2_lasx.S +ZNRM2KERNEL = znrm2_lasx.S + +CSWAPKERNEL = cswap_lasx.S +ZSWAPKERNEL = cswap_lasx.S + +CSUMKERNEL = csum_lasx.S +ZSUMKERNEL = csum_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S @@ -25,13 +107,35 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMVNKERNEL = sgemv_n_8_lasx.S SGEMVTKERNEL = sgemv_t_8_lasx.S +CGEMMKERNEL = cgemm_kernel_2x2_lsx.S +CGEMMONCOPY = cgemm_ncopy_2_lsx.S +CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZGEMMKERNEL = zgemm_kernel_2x2_lasx.S +ZGEMMONCOPY = zgemm_ncopy_2_lasx.S +ZGEMMOTCOPY = zgemm_tcopy_2_lasx.S +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + DTRSMKERNEL_LN = dtrsm_kernel_LN_16x4_lasx.S DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_lasx.S DTRSMKERNEL_RN = dtrsm_kernel_RN_16x4_lasx.S DTRSMKERNEL_RT = dtrsm_kernel_RT_16x4_lasx.S -endif STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif diff --git a/kernel/loongarch64/amax_lasx.S b/kernel/loongarch64/amax_lasx.S new file mode 100644 index 000000000..e964d4ddb --- /dev/null +++ b/kernel/loongarch64/amax_lasx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + XVFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM0, VM1 + XVFMAXA VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAXA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMAXA VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAXA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + xvfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d 
VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAXA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMAXA VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAXA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amax_lsx.S b/kernel/loongarch64/amax_lsx.S new file mode 100644 index 000000000..fb3b77a0e --- /dev/null +++ b/kernel/loongarch64/amax_lsx.S @@ -0,0 +1,231 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
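+
+Implementation note: AMAX returns max_i |x_i|. The vector loops keep the
+element of largest magnitude with VFMAXA (maximum by absolute value) and
+apply FABS to the scalar result only at the end; each main-loop iteration
+consumes 8 elements (two 128-bit LSX registers of floats, or four of
+doubles).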
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + VFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM0, VM1 + VFMAXA VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMAXA VM1, VX0, VX1 + VFMAXA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAXA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAXA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM1, VM1, VM2 + vfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmaxa.s VM1, VX0, VX1 + vfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAXA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAXA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin_lasx.S 
b/kernel/loongarch64/amin_lasx.S new file mode 100644 index 000000000..0a4359002 --- /dev/null +++ b/kernel/loongarch64/amin_lasx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + XVFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM0, VM1 + XVFMINA VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMINA VM1, VX0, VX1 + XVFMINA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMINA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMINA VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMINA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge 
$r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + xvfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMINA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMINA VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMINA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S new file mode 100644 index 000000000..644caf43c --- /dev/null +++ b/kernel/loongarch64/amin_lsx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + VFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM0, VM1 + VFMINA VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMINA VM1, VX0, VX1 + VFMINA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMINA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMINA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + SUB $f0, $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM1, VM1, VM2 + vfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmaxa.s VM1, VX0, VX1 + vfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d 
VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMINA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMINA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/asum_lasx.S b/kernel/loongarch64/asum_lasx.S new file mode 100644 index 000000000..9a2c031f3 --- /dev/null +++ b/kernel/loongarch64/asum_lasx.S @@ -0,0 +1,257 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
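+
+Implementation note: this kernel computes asum(x) = sum_i |x_i|. Negative
+elements are flipped by multiplying with a broadcast -1.0 (neg1) and
+selecting the product via xvfcmp.clt/xvbitsel; the per-lane partial sums
+kept in res1 are reduced with xvpickve/xvfadd before the scalar tail loop.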
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define VT0 $xr23 +#define VT1 $xr22 +#define res1 $xr16 +#define res2 $xr17 +#define res0 $xr18 +#define neg1 $xr19 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + xvreplgr2vr.d neg1, t1 + xvffint.d.l neg1, neg1 +#else + li.w t1, -1 + xvreplgr2vr.w neg1, t1 + xvffint.s.w neg1, neg1 +#endif + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w 
VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/asum_lsx.S b/kernel/loongarch64/asum_lsx.S new file mode 100644 index 000000000..512b01404 --- /dev/null +++ b/kernel/loongarch64/asum_lsx.S @@ -0,0 +1,258 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define VT0 $vr23 +#define VT1 $vr22 +#define res1 $vr16 +#define res2 $vr17 +#define res0 $vr18 +#define neg1 $vr19 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + vreplgr2vr.d neg1, t1 + vffint.d.l neg1, neg1 +#else + li.w t1, -1 + vreplgr2vr.w neg1, t1 + vffint.s.w neg1, neg1 +#endif + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + 
vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/axpby_lasx.S b/kernel/loongarch64/axpby_lasx.S new file mode 100644 index 000000000..f1d99cd3b --- /dev/null +++ b/kernel/loongarch64/axpby_lasx.S @@ -0,0 +1,1050 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
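+
+Implementation note: AXPBY computes y := alpha*x + beta*y. The kernel
+branches once up front on the four combinations of unit/non-unit INCX and
+INCY, and within each on the four special cases alpha == 0 / beta == 0, so
+the zero-coefficient paths skip the corresponding loads and multiplies.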
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r5 +#define INCX $r6 +#define BETA $f1 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXA $xr23 +#define VXB $xr9 +#define VXZ $xr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + MTG t1, ALPHA + MTG t2, BETA + MTG t3, a1 +#ifdef DOUBLE + xvreplgr2vr.d VXA, t1 + xvreplgr2vr.d VXB, t2 + xvreplgr2vr.d VXZ, t3 +#else + xvreplgr2vr.w VXA, t1 + xvreplgr2vr.w VXB, t2 + xvreplgr2vr.w VXZ, t3 +#endif + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 + b .L111 // ALPHA!=0 BETA!=0 + .align 3 + +.L110: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L114 // ALPHA==0 BETA==0 + b .L113 // ALPHA==0 BETA!=0 + .align 3 + +.L111: // ALPHA!=0 BETA!=0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmul.d VX0, VX0, VXA + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX2, VX2, VXB, VX0 + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvfmadd.s VX2, VX2, VXB, VX0 + xvst VX2, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // ALPHA!=0 BETA==0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvfmul.d VX0, VX0, VXA + xvfmul.d VX1, VX1, VXA + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE +#else + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // ALPHA==0 BETA!=0 + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX3, Y, 4 * SIZE + xvfmul.d VX2, VX2, VXB + xvfmul.d VX3, VX3, VXB + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE +#else + xvfmul.s VX2, VX2, VXB + xvst VX2, Y, 0 * SIZE +#endif + addi.d I, I, -1 + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // ALPHA==0 BETA==0 + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L120 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 + b .L121 // ALPHA!=0 BETA!=0 + .align 3 + +.L120: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L124 // ALPHA==0 BETA==0 + b .L123 // ALPHA==0 BETA!=0 + .align 3 + +.L121: // ALPHA!=0 BETA!=0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + 
xvfmul.d VX0, VX0, VXA + xvld VX1, X, 4 * SIZE + xvfmadd.d VX2, VX2, VXB, VX0 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // ALPHA!=0 BETA==0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvfmul.d VX0, VX0, VXA + xvfmul.d VX1, VX1, VXA + xvstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 +#else + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // ALPHA==0 BETA!=0 +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX2, VX2, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, 
INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX3, VX3, VXB + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // ALPHA==0 BETA==0 +#ifdef DOUBLE + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L210 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 + b .L211 // ALPHA!=0 BETA!=0 + .align 3 + +.L210: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L214 // ALPHA==0 BETA==0 + b .L213 // ALPHA==0 BETA!=0 + .align 3 + +.L211: // ALPHA!=0 BETA!=0 + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VX0, VXA, VX0 + xvfmadd.d VX2, VX2, VXB, VX0 + xvld VX3, Y, 4 * SIZE + xvst VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvst VX3, Y, 4 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + 
xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // ALPHA!=0 BETA==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VX0, VXA, VX0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvst VX0, Y, 0 * SIZE + xvfmul.d VX1, VX1, VXA + addi.d I, I, -1 + xvst VX1, Y, 4 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // ALPHA==0 BETA!=0 + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX3, Y, 4 * SIZE + xvfmul.d VX2, VX2, VXB + xvfmul.d VX3, VX3, VXB + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE +#else + xvfmul.s VX2, VX2, VXB + xvst VX2, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // ALPHA==0 BETA==0 + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 + b .L221 // ALPHA!=0 BETA!=0 + .align 3 + +.L220: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L224 // ALPHA==0 BETA==0 + b .L223 // ALPHA==0 BETA!=0 + .align 3 + +.L221: // ALPHA!=0 BETA!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, VX0, VXA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvfmadd.d VX2, VX2, VXB, VX0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, 
t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // ALPHA!=0 BETA==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VX0, VX0, VXA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX1, VX1, VXA + addi.d I, I, -1 + xvstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + 
xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // ALPHA==0 BETA!=0 +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX2, VX2, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX3, VX3, VXB + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // ALPHA==0 BETA==0 +#ifdef DOUBLE + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY 
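+ // ALPHA==0 && BETA==0: keep scattering the zero vector VXZ, one element per INCY step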
+ xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE + addi.d I, I, -1 + MUL $f12, $f12, ALPHA + MADD $f13, $f13, BETA, $f12 + ST $f13, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/axpby_lsx.S b/kernel/loongarch64/axpby_lsx.S new file mode 100644 index 000000000..45154c262 --- /dev/null +++ b/kernel/loongarch64/axpby_lsx.S @@ -0,0 +1,1148 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
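The file added below, kernel/loongarch64/axpby_lsx.S, is the 128-bit LSX counterpart of the 256-bit LASX AXPBY kernel above. Both compute y := alpha*x + beta*y over strided vectors, and both split the work into four specialized loop families depending on whether alpha and beta are zero, so that the beta==0 paths never read Y and the alpha==0, beta==0 paths reduce to storing zeros. A minimal scalar C sketch of the contract the vector paths are expected to match (axpby_ref and its signature are illustrative, assuming non-negative strides; this is not the OpenBLAS kernel interface):

    #include <stddef.h>

    /* Scalar model of ?AXPBY: y[i*incy] = alpha*x[i*incx] + beta*y[i*incy].
       The case split mirrors the kernel's .L111-.L114 loop families. */
    static void axpby_ref(size_t n, double alpha, const double *x, size_t incx,
                          double beta, double *y, size_t incy)
    {
        for (size_t i = 0; i < n; i++, x += incx, y += incy) {
            if (alpha == 0.0)
                *y = (beta == 0.0) ? 0.0 : beta * *y;     /* .L114 / .L113 */
            else if (beta == 0.0)
                *y = alpha * *x;                          /* .L112 */
            else
                *y = alpha * *x + beta * *y;              /* .L111 */
        }
    }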
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r5 +#define INCX $r6 +#define BETA $f1 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXA $vr23 +#define VXB $vr9 +#define VXZ $vr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + MTG t1, ALPHA + MTG t2, BETA + MTG t3, a1 +#ifdef DOUBLE + vreplgr2vr.d VXA, t1 + vreplgr2vr.d VXB, t2 + vreplgr2vr.d VXZ, t3 +#else + vreplgr2vr.w VXA, t1 + vreplgr2vr.w VXB, t2 + vreplgr2vr.w VXZ, t3 +#endif + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 + b .L111 // ALPHA!=0 BETA!=0 + .align 3 + +.L110: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L114 // ALPHA==0 BETA==0 + b .L113 // ALPHA==0 BETA!=0 + .align 3 + +.L111: // ALPHA!=0 BETA!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vfmadd.d VX2, VX2, VXB, VX0 + vfmadd.d VX3, VX3, VXB, VX1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vfmadd.d VX2, VX2, VXB, VX0 + vfmadd.d VX3, VX3, VXB, VX1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vfmadd.s VX2, VX2, VXB, VX0 + vfmadd.s VX3, VX3, VXB, VX1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // ALPHA!=0 BETA==0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfmul.d VX2, VX2, VXA + vfmul.d VX3, VX3, VXA + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // ALPHA==0 BETA!=0 +#ifdef DOUBLE + vld VX0, Y, 0 * SIZE + vld VX1, Y, 2 * SIZE + vfmul.d VX0, VX0, VXB + vfmul.d VX1, VX1, VXB + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VX2, VX2, VXB + vfmul.d VX3, VX3, VXB + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // ALPHA==0 BETA==0 + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, Y, 2 * SIZE + vst VXZ, Y, 4 * SIZE + vst VXZ, 
Y, 6 * SIZE +#else + vst VXZ, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L120 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 + b .L121 // ALPHA!=0 BETA!=0 + .align 3 + +.L120: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L124 // ALPHA==0 BETA==0 + b .L123 // ALPHA==0 BETA!=0 + .align 3 + +.L121: // ALPHA!=0 BETA!=0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vfmul.d VX0, VX0, VXA + vld VX1, X, 2 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + vld VX0, X, 4 * SIZE + vfmadd.d VX3, VX3, VXB, VX1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX0, VX0, VXA + vld VX1, X, 6 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + vld VX1, X, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // ALPHA!=0 BETA==0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 +#else + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, 
INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // ALPHA==0 BETA!=0 +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // ALPHA==0 BETA==0 +#ifdef DOUBLE + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L210 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 + b .L211 // ALPHA!=0 BETA!=0 + .align 3 + +.L210: + CMPEQ $fcc0, BETA, 
a1 + bcnez $fcc0, .L214 // ALPHA==0 BETA==0 + b .L213 // ALPHA==0 BETA!=0 + .align 3 + +.L211: // ALPHA!=0 BETA!=0 + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VX0, VXA, VX0 + vld VX3, Y, 2 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX2, Y, 0 * SIZE + vfmul.d VX1, VXA, VX1 + vld VX2, Y, 4 * SIZE + vfmadd.d VX3, VX3, VXB, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vst VX3, Y, 2 * SIZE + vfmul.d VX0, VX0, VXA + vld VX3, Y, 6 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX2, Y, 4 * SIZE + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vst VX3, Y, 6 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX2, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // ALPHA!=0 BETA==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VX0, VXA, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 * SIZE + vfmul.d VX1, VXA, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vst VX1, Y, 2 * SIZE + vfmul.d VX0, VX0, VXA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 4 * SIZE + vfmul.d VX1, VX1, VXA + addi.d I, I, -1 + vst VX1, Y, 6 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX0, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vst VX1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // ALPHA==0 BETA!=0 + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX3, Y, 2 * SIZE + vfmul.d VX2, VX2, VXB + vfmul.d VX3, VX3, VXB + vst 
VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VX2, VX2, VXB + vfmul.d VX3, VX3, VXB + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // ALPHA==0 BETA==0 + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, Y, 2 * SIZE + vst VXZ, Y, 4 * SIZE + vst VXZ, Y, 6 * SIZE +#else + vst VXZ, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 + b .L221 // ALPHA!=0 BETA!=0 + .align 3 + +.L220: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L224 // ALPHA==0 BETA==0 + b .L223 // ALPHA==0 BETA!=0 + .align 3 + +.L221: // ALPHA!=0 BETA!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VX0, VXA + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VX0, VXA + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vfmadd.s VX2, VX2, VXB, VX0 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + 
add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vfmadd.s VX3, VX3, VXB, VX1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // ALPHA!=0 BETA==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VX0, VX0, VXA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX0, VX0, VXA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + addi.d I, I, -1 + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // ALPHA==0 BETA!=0 +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, 
Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // ALPHA==0 BETA==0 +#ifdef DOUBLE + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE + addi.d I, I, -1 + MUL $f12, $f12, ALPHA + MADD $f13, $f13, BETA, $f12 + ST $f13, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/axpy_lasx.S b/kernel/loongarch64/axpy_lasx.S new file mode 100644 index 000000000..707fd09b5 --- /dev/null +++ b/kernel/loongarch64/axpy_lasx.S @@ -0,0 +1,529 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
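The new file in this hunk, kernel/loongarch64/axpy_lasx.S, implements ?AXPY, y := alpha*x + y, with 256-bit LASX vectors. Because there is no beta term, the prologue can return immediately when alpha == 0 (the CMPEQ/bcnez branch to .L999), and the contiguous path takes a multiply-free vector-add loop when alpha == 1 (the xvfadd loop at .L111). A scalar C sketch of the semantics, with an illustrative name and signature rather than the real kernel interface:

    #include <stddef.h>

    /* Scalar model of ?AXPY: y[i*incy] += alpha * x[i*incx].
       alpha == 0 is a no-op, hence the kernel's early exit. */
    static void axpy_ref(size_t n, float alpha, const float *x, size_t incx,
                         float *y, size_t incy)
    {
        if (alpha == 0.0f)
            return;
        for (size_t i = 0; i < n; i++)
            y[i * incy] += alpha * x[i * incx];
    }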
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXA $xr23 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + movgr2fr.d a2, TEMP + FFINT a2, a2 + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L999 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + MTG t1, ALPHA +#ifdef DOUBLE + xvreplgr2vr.d VXA, t1 +#else + xvreplgr2vr.w VXA, t1 +#endif + + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L113 + CMPEQ $fcc0, ALPHA, a2 + bceqz $fcc0, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfadd.d VX2, VX0, VX2 + xvfadd.d VX3, VX1, VX3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfadd.s VX2, VX0, VX2 + xvst VX2, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L113 + .align 3 + +.L112: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmadd.d VX2, VX0, VXA, VX2 + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfmadd.s VX2, VX0, VXA, VX2 + xvst VX2, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L112 + .align 3 + +.L113: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L114: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L114 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + 
xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX2, VX0, VXA, VX2 + xvld VX1, X, 4 * SIZE + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmadd.d VX2, VX0, VXA, VX2 + xvld VX3, Y, 4 * SIZE + xvst VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 
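+ // eight gathered X singles folded into Y with one fused multiply-add; store the results contiguously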
+ xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX2, VX0, VXA, VX2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 
+ .align 3 + +.L224: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/axpy_lsx.S b/kernel/loongarch64/axpy_lsx.S new file mode 100644 index 000000000..0d74e2bce --- /dev/null +++ b/kernel/loongarch64/axpy_lsx.S @@ -0,0 +1,573 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
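The final file of the patch, kernel/loongarch64/axpy_lsx.S, repeats the AXPY kernel with 128-bit LSX registers, so each 8-element block needs twice as many vector operations as the LASX version (two doubles or four floats per register instead of four and eight). All four kernels in this patch share the same blocking: srai.d I, N, 3 sets the main-loop trip count to N/8, and andi I, N, 7 drives a scalar tail loop for the remainder. A C sketch of that decomposition (axpy_blocked is an illustrative stand-in, unit strides assumed):

    #include <stddef.h>

    /* Blocking shape shared by the kernels: an 8-element vector body
       (srai.d I, N, 3) followed by a scalar tail (andi I, N, 7).
       The inner k-loop stands in for the SIMD block. */
    static void axpy_blocked(size_t n, double alpha, const double *x, double *y)
    {
        size_t i = 0;
        for (size_t blocks = n >> 3; blocks != 0; blocks--, i += 8)
            for (size_t k = 0; k < 8; k++)
                y[i + k] += alpha * x[i + k];
        for (size_t tail = n & 7; tail != 0; tail--, i++)
            y[i] += alpha * x[i];
    }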
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXA $vr23 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + movgr2fr.d a2, TEMP + FFINT a2, a2 + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L999 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + MTG t1, ALPHA +#ifdef DOUBLE + vreplgr2vr.d VXA, t1 +#else + vreplgr2vr.w VXA, t1 +#endif + + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L113 + CMPEQ $fcc0, ALPHA, a2 + bceqz $fcc0, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfadd.d VX2, VX0, VX2 + vfadd.d VX3, VX1, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vfadd.d VX2, VX0, VX2 + vfadd.d VX3, VX1, VX3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfadd.s VX2, VX0, VX2 + vfadd.s VX3, VX1, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L113 + .align 3 + +.L112: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfmadd.d VX2, VX0, VXA, VX2 + vfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + vfmadd.d VX2, VX0, VXA, VX2 + vfmadd.d VX3, VX1, VXA, VX3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX0, VXA, VX2 + vfmadd.s VX3, VX1, VXA, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 +#endif + blt $r0, I, .L112 + .align 3 + +.L113: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L114: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L114 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + vld VX1, X, 2 * SIZE + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmadd.d 
VX3, VX1, VXA, VX3 + vld VX0, X, 4 * SIZE + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + vld VX1, X, 6 * SIZE + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + vld VX1, X, 4 * SIZE + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE +#endif + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmadd.d VX2, VX0, VXA, VX2 + vld VX3, Y, 2 * SIZE + vst VX2, Y, 0 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmadd.d VX3, VX1, VXA, VX3 + vld VX2, Y, 4 * SIZE + vst VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmadd.d VX2, VX0, VXA, VX2 + vld VX3, Y, 6 * SIZE + vst VX2, Y, 4 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX3, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmadd.s VX2, VX0, VXA, VX2 + vld VX3, Y, 4 * SIZE + vst VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w 
VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + 
add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camax_lasx.S b/kernel/loongarch64/camax_lasx.S new file mode 100644 index 000000000..7013430cb --- /dev/null +++ b/kernel/loongarch64/camax_lasx.S @@ -0,0 +1,194 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
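[Annotation] The camax kernels introduced next compute the complex "absolute maximum" in the BLAS sense: an element's magnitude is |Re| + |Im|, not the Euclidean modulus. The vector loop at .L10 splits real and imaginary lanes with xvpickev/xvpickod and folds across lanes at .L11. A hedged C reference for the single-precision case (camax_ref is a hypothetical name; incx counts complex elements, matching the ZBASE_SHIFT scaling in the prologue):

#include <math.h>

/* max over i of |Re(x_i)| + |Im(x_i)|, with the running maximum seeded
   to zero as the xvxor.v of VM0 does. */
static float camax_ref(long n, const float *x, long incx)
{
    float m = 0.0f;
    for (long i = 0; i < n; i++) {
        const float *p = x + 2 * i * incx;   /* interleaved re, im */
        float a = fabsf(p[0]) + fabsf(p[1]);
        if (a > m) m = a;
    }
    return m;
}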
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VT0 $xr13 +#define VT1 $xr14 +#define res0 $xr18 +#define neg1 $xr19 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 + + PROLOGUE + xvxor.v VM0, VM0, VM0 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + xvreplgr2vr.w neg1, I + xvffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, neg1, x1 + xvfmul.s x4, neg1, x2 + xvfcmp.clt.s VT0, x1, res0 + xvfcmp.clt.s VT1, x2, res0 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VT1 + addi.d X, X, 16 * SIZE + xvfadd.s VM1, x1, x2 + xvfmax.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmax.s VM1, x1, x2 + xvfmax.s VM0, x3, x4 + xvfmax.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmax.s s1, s1, s2 + fmax.s s3, s3, s4 + fmax.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + fmax.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camax_lsx.S b/kernel/loongarch64/camax_lsx.S new file mode 100644 index 000000000..2e55629de --- /dev/null +++ b/kernel/loongarch64/camax_lsx.S @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VT0 $vr13 +#define VT1 $vr14 +#define res0 $vr18 +#define neg1 $vr19 +#define VX0 $vr20 +#define VX1 $vr21 +#define VM0 $vr22 +#define VM1 $vr23 + + PROLOGUE + vxor.v VM0, VM0, VM0 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + vreplgr2vr.w neg1, I + vffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + vld VX0, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vld VX1, X, 12 * SIZE + vfadd.s VM1, x1, x2 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + addi.d X, X, 16 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vfadd.s x1, x1, x2 + vfmax.s VM1, x1, VM1 + vfmax.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmax.s VM1, x1, x2 + vfmax.s VM0, x3, x4 + vfmax.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, 
t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmax.s s1, s1, s2 + fmax.s s3, s3, s4 + fmax.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + addi.d I, I, -1 + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s a0, a0, a1 + add.d X, X, INCX + fmax.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camin_lasx.S b/kernel/loongarch64/camin_lasx.S new file mode 100644 index 000000000..d7931d30a --- /dev/null +++ b/kernel/loongarch64/camin_lasx.S @@ -0,0 +1,199 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
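[Annotation] camin_lasx.S, opening here, is the minimum-magnitude counterpart: the same |Re| + |Im| metric, but the running value is seeded from the first element in the prologue (the two fld.s/fabs.s plus fadd.s before the loop) rather than from zero, since a zero seed would be absorbing for a minimum. A C sketch under the same conventions (camin_ref is hypothetical):

#include <math.h>

/* min over i of |Re(x_i)| + |Im(x_i)|; seeding from element 0 and then
   rescanning it, as the assembly does, is harmless for a minimum. */
static float camin_ref(long n, const float *x, long incx)
{
    if (n <= 0) return 0.0f;
    float m = fabsf(x[0]) + fabsf(x[1]);
    for (long i = 0; i < n; i++) {
        const float *p = x + 2 * i * incx;
        float a = fabsf(p[0]) + fabsf(p[1]);
        if (a < m) m = a;
    }
    return m;
}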
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define TEMP $r16 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define a0 $f20 +#define a1 $f21 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VT0 $xr13 +#define VT1 $xr14 +#define res0 $xr18 +#define neg1 $xr19 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 + + PROLOGUE + MTC s1, $r0 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s s1, a1, a0 + xvreplve0.w VM0, VM0 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + xvreplgr2vr.w neg1, I + xvffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, neg1, x1 + xvfmul.s x4, neg1, x2 + xvfcmp.clt.s VT0, x1, res0 + xvfcmp.clt.s VT1, x2, res0 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VT1 + addi.d X, X, 16 * SIZE + xvfadd.s VM1, x1, x2 + xvfmin.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmin.s VM1, x1, x2 + xvfmin.s VM0, x3, x4 + xvfmin.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmin.s s1, s1, s2 + fmin.s s3, s3, s4 + fmin.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + fmin.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camin_lsx.S b/kernel/loongarch64/camin_lsx.S new file mode 100644 index 000000000..e9ad6b04d --- /dev/null +++ b/kernel/loongarch64/camin_lsx.S @@ -0,0 +1,211 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
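[Annotation] camin_lsx.S below is the 128-bit LSX twin of the LASX kernel above: the same algorithm with 4 float lanes per vector instead of 8, so each unrolled iteration issues two load/compare/select rounds to still retire 8 complex elements. Its final cross-lane fold at .L11 extracts lanes with vreplvei.w and combines them pairwise with vfmin.s; a sketch of that reduction in C, assuming the 4 partial minima sit in an array (hmin4 is a hypothetical helper):

/* Pairwise horizontal min of 4 lanes, mirroring the vreplvei.w/vfmin.s
   sequence at .L11: min(min(v0,v1), min(v2,v3)). */
static float hmin4(const float v[4])
{
    float a = v[0] < v[1] ? v[0] : v[1];
    float b = v[2] < v[3] ? v[2] : v[3];
    return a < b ? a : b;
}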
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VT0 $vr13 +#define VT1 $vr14 +#define res0 $vr18 +#define neg1 $vr19 +#define VX0 $vr20 +#define VX1 $vr21 +#define VM0 $vr22 +#define VM1 $vr23 + + PROLOGUE + MTC s1, $r0 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s s1, a1, a0 + vreplvei.w VM0, VM0, 0 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + vreplgr2vr.w neg1, I + vffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + vld VX0, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vld VX1, X, 12 * SIZE + vfadd.s VM1, x1, x2 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + addi.d X, X, 16 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vfadd.s x1, x1, x2 + vfmin.s VM1, x1, VM1 + vfmin.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmin.s VM1, x1, x2 + vfmin.s VM0, x3, x4 + vfmin.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s 
t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmin.s s1, s1, s2 + fmin.s s3, s3, s4 + fmin.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + addi.d I, I, -1 + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s a0, a0, a1 + add.d X, X, INCX + fmin.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/casum_lasx.S b/kernel/loongarch64/casum_lasx.S new file mode 100644 index 000000000..caf0ff969 --- /dev/null +++ b/kernel/loongarch64/casum_lasx.S @@ -0,0 +1,329 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
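[Annotation] The casum kernels starting here accumulate the complex absolute sum, i.e. the sum over i of |Re(x_i)| + |Im(x_i)|. Absolute values are formed branchlessly (multiply by a vector of -1.0, compare against zero, bit-select), partial sums live in the vector accumulators res1/res2, and lanes are folded at .L12/.L22. A hedged scalar reference for the single-precision build (casum_ref is a hypothetical name):

#include <math.h>

/* Complex absolute sum; the scalar tail loops .L14/.L24 do exactly this,
   two floats per element. */
static float casum_ref(long n, const float *x, long incx)
{
    float s = 0.0f;
    for (long i = 0; i < n; i++) {
        const float *p = x + 2 * i * incx;   /* interleaved re, im */
        s += fabsf(p[0]) + fabsf(p[1]);
    }
    return s;
}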
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 +#define res3 $xr18 +#define res0 $xr19 +#define neg1 $xr20 +#define VT0 $xr21 +#define VT1 $xr22 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + xvreplgr2vr.d neg1, t1 + xvffint.d.l neg1, neg1 +#else + li.w t1, -1 + xvreplgr2vr.w neg1, t1 + xvffint.s.w neg1, neg1 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvfmul.d VX0, neg1, VX2 + xvfmul.d VX1, neg1, VX3 + xvfcmp.clt.d VT0, VX2, res0 + xvfcmp.clt.d VT1, VX3, res0 + xvbitsel.v VX2, VX2, VX0, VT0 + xvbitsel.v VX3, VX3, VX1, VT1 + xvfadd.d res2, VX2, VX3 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvfmul.s VX2, neg1, VX0 + xvfmul.s VX3, neg1, VX1 + xvfcmp.clt.s VT0, VX0, res0 + xvfcmp.clt.s VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, 
VX1 + xvfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvfmul.s VX2, neg1, VX0 + xvfmul.s VX3, neg1, VX1 + xvfcmp.clt.s VT0, VX0, res0 + xvfcmp.clt.s VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/casum_lsx.S b/kernel/loongarch64/casum_lsx.S new file mode 100644 index 000000000..4822f2080 --- /dev/null +++ b/kernel/loongarch64/casum_lsx.S @@ -0,0 +1,358 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 +#define res0 $vr19 +#define neg1 $vr20 +#define VT0 $vr21 +#define VT1 $vr22 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + vreplgr2vr.d neg1, t1 + vffint.d.l neg1, neg1 +#else + li.w t1, -1 + vreplgr2vr.w neg1, t1 + vffint.s.w neg1, neg1 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfmul.d VX0, neg1, VX2 + vfmul.d VX1, neg1, VX3 + vfcmp.clt.d VT0, VX2, res0 + vfcmp.clt.d VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vfmul.d VX0, neg1, VX2 + vfmul.d VX1, neg1, VX3 + vfcmp.clt.d VT0, VX2, res0 + vfcmp.clt.d VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + addi.d I, I, -1 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vld VX0, 
X, 8 * SIZE + vld VX1, X, 12 * SIZE + addi.d I, I, -1 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res3, VX1, VX0 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d X, X, 16 * SIZE + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w 
VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vfmul.s VX0, neg1, VX2 + vfmul.s VX1, neg1, VX3 + vfcmp.clt.s VT0, VX2, res0 + vfcmp.clt.s VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/caxpy_lasx.S b/kernel/loongarch64/caxpy_lasx.S new file mode 100644 index 000000000..2b970fe70 --- /dev/null +++ b/kernel/loongarch64/caxpy_lasx.S @@ -0,0 +1,707 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
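[Annotation] caxpy_lasx.S, opening here, performs the complex AXPY update. The kernel deinterleaves real and imaginary lanes with xvpickev/xvpickod, forms the product with two multiplies plus a fused multiply-subtract/multiply-add pair, and re-interleaves with xvilvl/xvilvh before the store; the CONJ build conjugates x, giving the caxpyc entry point. A C sketch of the semantics (caxpy_ref and the conj_x flag are illustrative, not the patch's interface):

#include <complex.h>

/* y[i] += alpha * x[i], or alpha * conj(x[i]) in the CONJ build.
   Expanded: Re += ar*xr - ai*xi, Im += ar*xi + ai*xr; the signs on the
   xi terms flip under conjugation, matching the xvfmsub/xvfmadd pairs. */
static void caxpy_ref(long n, float complex alpha,
                      const float complex *x, long incx,
                      float complex *y, long incy, int conj_x)
{
    for (long i = 0; i < n; i++) {
        float complex xi = x[i * incx];
        y[i * incy] += alpha * (conj_x ? conjf(xi) : xi);
    }
}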
+*****************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L10 + bcnez $fcc1, .L999 +.L10: + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, ALPHAR + MTG t2, ALPHAI +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + xvreplgr2vr.d VXAI, t2 + srai.d I, N, 2 +#else + xvreplgr2vr.w VXAR, t1 + xvreplgr2vr.w VXAI, t2 + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d 
Y, Y, INCY + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + xvld VX1, X, 8 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 +#else + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + addi.d X, X, 16 * SIZE +#endif + blt $r0, I, .L121 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + 
xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvld VX3, Y, 8 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d 
X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif + addi.d I, I, -1 +#ifdef DOUBLE + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 +#else + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 +#if !defined(CONJ) + MUL s1, 
ALPHAI, a2
+    MUL s2, ALPHAI, a1
+    MSUB s3, ALPHAR, a1, s1
+    MADD s4, a2, ALPHAR, s2
+    ADD s3, s3, a3
+    ADD s4, s4, a4
+#else
+    MUL s1, ALPHAI, a2
+    MUL s2, ALPHAI, a1
+    MADD s3, ALPHAR, a1, s1
+    MSUB s4, a2, ALPHAR, s2
+    ADD s3, s3, a3
+    SUB s4, a4, s4
+#endif
+    ST s3, Y, 0 * SIZE
+    ST s4, Y, 1 * SIZE
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/caxpy_lsx.S b/kernel/loongarch64/caxpy_lsx.S
new file mode 100644
index 000000000..85598d0b9
--- /dev/null
+++ b/kernel/loongarch64/caxpy_lsx.S
@@ -0,0 +1,679 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define XX $r5
+#define YY $r6
+#define ALPHAR $f0
+#define ALPHAI $f1
+#define X $r7
+#define INCX $r8
+#define Y $r9
+#define INCY $r10
+
+#define I $r12
+#define TEMP $r13
+#define t1 $r14
+#define t2 $r16
+#define t3 $r15
+#define t4 $r17
+#define a1 $f12
+#define a2 $f13
+#define a3 $f14
+#define a4 $f15
+#define s1 $f16
+#define s2 $f17
+#define s3 $f18
+#define s4 $f19
+#define VX0 $vr8
+#define VX1 $vr20
+#define VX2 $vr21
+#define VX3 $vr22
+#define VXAR $vr23
+#define VXAI $vr19
+#define x1 $vr18
+#define x2 $vr17
+#define x3 $vr16
+#define x4 $vr15
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    FFINT a1, a1
+    CMPEQ $fcc0, ALPHAR, a1
+    CMPEQ $fcc1, ALPHAI, a1
+    bceqz $fcc0, .L10
+    bcnez $fcc1, .L999
+.L10:
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    slli.d INCY, INCY, ZBASE_SHIFT
+    MTG t1, ALPHAR
+    MTG t2, ALPHAI
+#ifdef DOUBLE
+    vreplgr2vr.d VXAR, t1
+    vreplgr2vr.d VXAI, t2
+#else
+    vreplgr2vr.w VXAR, t1
+    vreplgr2vr.w VXAI, t2
+#endif
+    srai.d I, N, 2
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11 // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L997
+    .align 3
+
+.L111:
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vpickev.w x3, VX3, VX2
+    vpickod.w x4, VX3, VX2
+#endif
+#if !defined(CONJ)
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmsub.s VX1, VXAR, x1, VX0
+    vfmadd.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfadd.s x4, x4, VX3
+#endif
+#else
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmadd.s VX1, VXAR, x1, VX0
+    vfmsub.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfsub.s x4, x4, VX3
+#endif
+#endif
+#ifdef DOUBLE
+    vilvl.d VX2, x4, x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+#if !defined(CONJ)
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#endif
+    vilvl.d VX2, x4, x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+#else
+    vilvl.w VX2, x4, x3
+    vilvh.w VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997
+    move YY, Y
+    .align 3
+
+.L121:
+#ifdef
DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 +#else + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * 
SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + 
vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 +#endif + add.d Y, Y, INCY +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 +#else + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 +#if !defined(CONJ) + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s3, ALPHAR, a1, s1 + MADD s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + ADD s4, s4, a4 +#else + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MADD s3, ALPHAR, a1, s1 + MSUB s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + SUB s4, a4, s4 +#endif + ST s3, Y, 0 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/ccopy_lasx.S b/kernel/loongarch64/ccopy_lasx.S new file mode 100644 index 000000000..fbc5d96bc --- /dev/null +++ b/kernel/loongarch64/ccopy_lasx.S @@ -0,0 +1,386 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define Y $r7
+#define INCY $r8
+#define I $r17
+#define TEMP $r18
+#define t1 $r14
+#define t2 $r15
+#define t3 $r16
+#define t4 $r19
+#define a1 $f12
+#define a2 $f13
+#define a3 $f14
+#define a4 $f15
+#define VX0 $xr12
+#define VX1 $xr13
+#define VX2 $xr14
+#define VX3 $xr15
+
+    PROLOGUE
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    slli.d INCY, INCY, ZBASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11 // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvld VX2, X, 8 * SIZE
+    xvld VX3, X, 12 * SIZE
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 4 * SIZE
+    xvst VX2, Y, 8 * SIZE
+    xvst VX3, Y, 12 * SIZE
+#else
+    xvld VX1, X, 8 * SIZE
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 8 * SIZE
+#endif
+    addi.d I, I, -1
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, 2 * SIZE
+    ST a1, Y, 0 * SIZE
+    ST a2, Y, 1 * SIZE
+    addi.d Y, Y, 2 * SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+.L12:
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvld VX2, X, 8 * SIZE
+    xvld VX3, X, 12 * SIZE
+    xvstelm.d VX0, Y, 0 * SIZE, 0
+    xvstelm.d VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0 * SIZE, 2
+    xvstelm.d VX0, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0 * SIZE, 0
+    xvstelm.d VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0 * SIZE, 2
+    xvstelm.d VX1, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.d VX2, Y, 0 * SIZE, 0
+    xvstelm.d VX2, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX2, Y, 0 * SIZE, 2
+    xvstelm.d VX2, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.d VX3, Y, 0 * SIZE, 0
+    xvstelm.d VX3, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d
VX3, Y, 0 * SIZE, 2 + xvstelm.d VX3, Y, 1 * SIZE, 3 +#else + xvld VX1, X, 8 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 0 + xvstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 2 + xvstelm.w VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 4 + xvstelm.w VX0, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 6 + xvstelm.w VX0, Y, 1 * SIZE, 7 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 0 + xvstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 2 + xvstelm.w VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 4 + xvstelm.w VX1, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 6 + xvstelm.w VX1, Y, 1 * SIZE, 7 +#endif + add.d Y, Y, INCY + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + xvst VX2, Y, 8 * SIZE + xvst VX3, Y, 12 * SIZE +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 8 * SIZE +#endif + addi.d I, I, -1 + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: 
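+    // Neither stride is unit here, so the elements are not contiguous in
+    // memory; fall back to scalar LD/ST pairs, moving eight complex
+    // values (four two-element groups) per loop iteration.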
+ LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/ccopy_lsx.S b/kernel/loongarch64/ccopy_lsx.S new file mode 100644 index 000000000..4c4d880f1 --- /dev/null +++ b/kernel/loongarch64/ccopy_lsx.S @@ -0,0 +1,411 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11:// INCX==1 and INCY==1 + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + addi.d I, I, -1 + vst VX0, Y, 8 * SIZE + vst VX1, Y, 10 * SIZE + vst VX2, Y, 12 * SIZE + vst VX3, Y, 14 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + addi.d I, I, -1 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + vst VX2, Y, 8 * SIZE + vst VX3, Y, 12 * SIZE +#endif + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + vstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0 * SIZE, 0 + vstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX2, Y, 0 * SIZE, 0 + vstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX3, Y, 0 * SIZE, 0 + vstelm.d VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + vstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0 * SIZE, 0 + vstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX2, Y, 0 * SIZE, 0 + vstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX3, Y, 0 * SIZE, 0 + vstelm.d VX3, Y, 1 * SIZE, 1 +#else + vld VX1, X, 4 * SIZE + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 0 + vstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0 * SIZE, 2 + vstelm.w VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0 * SIZE, 0 + vstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0 * SIZE, 2 + vstelm.w VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX2, Y, 0 * SIZE, 0 + vstelm.w VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX2, Y, 0 * SIZE, 2 + vstelm.w VX2, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX3, Y, 0 * SIZE, 0 + vstelm.w VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX3, Y, 0 * SIZE, 2 + vstelm.w VX3, Y, 1 * SIZE, 3 +#endif + add.d Y, Y, INCY + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + 
bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 8 * SIZE + vst VX1, Y, 10 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 12 * SIZE + vst VX1, Y, 14 * SIZE +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + vst VX2, Y, 8 * SIZE + vst VX3, Y, 12 * SIZE +#endif + addi.d Y, Y, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * 
SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cdot_lasx.S b/kernel/loongarch64/cdot_lasx.S new file mode 100644 index 000000000..0583e56ea --- /dev/null +++ b/kernel/loongarch64/cdot_lasx.S @@ -0,0 +1,565 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r19 +#define TEMP $r10 +#define t1 $r11 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define res1 $xr16 +#define res2 $xr17 +#define res3 $xr18 +#define res4 $xr19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define x1 $xr20 +#define x2 $xr21 +#define x3 $xr22 +#define x4 $xr23 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res3, res3, res3 + xvxor.v res4, res4, res4 + bge $r0, N, .L999 + li.d TEMP, 2 * SIZE + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 2 +#else + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L996 + .align 3 + +.L12: + bge $r0, I, .L997 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + addi.d X, X, 8 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + xvld VX1, X, 8 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + 
add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + addi.d X, X, 16 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L21: + bge $r0, I, .L997 + .align 3 + +.L211: + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + addi.d Y, Y, 8 * SIZE + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvld VX3, Y, 8 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + addi.d Y, Y, 8 * SIZE + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + b .L996 + .align 3 + +.L22: + bge $r0, I, .L997 + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 
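+    // res1..res4 accumulate the partial products xr*yr, xi*yr, xr*yi and
+    // xi*yi; .L996 reduces each accumulator across lanes and .L999 combines
+    // the four sums into the real/imaginary result according to CONJ.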
+#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L996: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + xvpickve.d VX1, res2, 1 + xvpickve.d VX2, res2, 2 + xvpickve.d VX3, res2, 3 + xvfadd.d res2, VX1, res2 + xvfadd.d res2, VX2, res2 + xvfadd.d res2, VX3, res2 + xvpickve.d VX1, res3, 1 + xvpickve.d VX2, res3, 2 + xvpickve.d VX3, res3, 3 + xvfadd.d res3, VX1, res3 + xvfadd.d res3, VX2, res3 + xvfadd.d res3, VX3, res3 + xvpickve.d VX1, res4, 1 + xvpickve.d VX2, res4, 2 + xvpickve.d VX3, res4, 3 + xvfadd.d res4, VX1, res4 + xvfadd.d res4, VX2, res4 + xvfadd.d res4, VX3, res4 +#else + xvpickve.w VX0, res1, 1 + xvpickve.w VX1, res1, 2 + xvpickve.w VX2, res1, 3 + xvpickve.w VX3, res1, 4 + xvpickve.w x1, res1, 5 + xvpickve.w x2, res1, 6 + xvpickve.w x3, res1, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvfadd.s res1, x1, res1 + xvfadd.s res1, x2, res1 + xvfadd.s res1, x3, res1 + xvpickve.w VX0, res2, 1 + xvpickve.w VX1, res2, 2 + xvpickve.w VX2, res2, 3 + xvpickve.w VX3, res2, 4 + xvpickve.w x1, res2, 5 + xvpickve.w x2, res2, 6 + xvpickve.w x3, res2, 7 + xvfadd.s res2, VX0, res2 + xvfadd.s res2, VX1, res2 + xvfadd.s res2, VX2, res2 + xvfadd.s res2, VX3, res2 + xvfadd.s res2, x1, res2 + xvfadd.s res2, x2, res2 + xvfadd.s res2, x3, res2 + xvpickve.w VX0, res3, 1 + xvpickve.w VX1, res3, 2 + xvpickve.w VX2, res3, 3 + xvpickve.w VX3, res3, 4 + xvpickve.w x1, res3, 5 
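+    // finish folding the remaining lanes of res3 and res4 into lane 0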
+    xvpickve.w x2, res3, 6
+    xvpickve.w x3, res3, 7
+    xvfadd.s res3, VX0, res3
+    xvfadd.s res3, VX1, res3
+    xvfadd.s res3, VX2, res3
+    xvfadd.s res3, VX3, res3
+    xvfadd.s res3, x1, res3
+    xvfadd.s res3, x2, res3
+    xvfadd.s res3, x3, res3
+    xvpickve.w VX0, res4, 1
+    xvpickve.w VX1, res4, 2
+    xvpickve.w VX2, res4, 3
+    xvpickve.w VX3, res4, 4
+    xvpickve.w x1, res4, 5
+    xvpickve.w x2, res4, 6
+    xvpickve.w x3, res4, 7
+    xvfadd.s res4, VX0, res4
+    xvfadd.s res4, VX1, res4
+    xvfadd.s res4, VX2, res4
+    xvfadd.s res4, VX3, res4
+    xvfadd.s res4, x1, res4
+    xvfadd.s res4, x2, res4
+    xvfadd.s res4, x3, res4
+#endif
+    .align 3
+
+.L997:
+#ifdef DOUBLE
+    andi I, N, 3
+#else
+    andi I, N, 7
+#endif
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE
+    LD a4, Y, 1 * SIZE
+    MADD s1, a1, a3, s1
+    MADD s2, a2, a3, s2
+    MADD s3, a1, a4, s3
+    MADD s4, a2, a4, s4
+    addi.d I, I, -1
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+#ifndef CONJ
+    SUB $f0, s1, s4
+    ADD $f1, s3, s2
+#else
+    ADD $f0, s1, s4
+    SUB $f1, s3, s2
+#endif
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/cdot_lsx.S b/kernel/loongarch64/cdot_lsx.S
new file mode 100644
index 000000000..5feea12be
--- /dev/null
+++ b/kernel/loongarch64/cdot_lsx.S
@@ -0,0 +1,397 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r19 +#define TEMP $r10 +#define t1 $r11 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 +#define res4 $vr19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define x1 $vr20 +#define x2 $vr21 +#define x3 $vr22 +#define x4 $vr23 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res3, res3, res3 + vxor.v res4, res4, res4 + bge $r0, N, .L999 + li.d TEMP, 2 * SIZE + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 1 +#else + srai.d I, N, 2 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L996 + .align 3 + +.L12: + bge $r0, I, .L997 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + addi.d X, X, 4 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + addi.d X, X, 8 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L21: + bge $r0, I, .L997 + .align 3 + +.L211: + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + 
vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + addi.d Y, Y, 4 * SIZE + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + addi.d Y, Y, 8 * SIZE + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + b .L996 + .align 3 + +.L22: + bge $r0, I, .L997 + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L996: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + vreplvei.d VX1, res2, 1 + vfadd.d res2, VX1, res2 + vreplvei.d VX1, res3, 1 + vfadd.d res3, VX1, res3 + vreplvei.d VX1, res4, 1 + vfadd.d res4, VX1, res4 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 + vreplvei.w VX1, res2, 1 + vreplvei.w VX2, res2, 2 + vreplvei.w VX3, res2, 3 + vfadd.s res2, VX1, res2 + vfadd.s res2, VX2, res2 + vfadd.s res2, VX3, res2 + vreplvei.w VX1, res3, 1 + vreplvei.w VX2, res3, 2 + vreplvei.w VX3, res3, 3 + vfadd.s res3, VX1, res3 + vfadd.s res3, VX2, res3 + vfadd.s res3, VX3, res3 + vreplvei.w VX1, res4, 1 + vreplvei.w VX2, res4, 2 + vreplvei.w VX3, res4, 3 + vfadd.s res4, VX1, res4 + vfadd.s res4, VX2, res4 + 
vfadd.s res4, VX3, res4 +#endif + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 1 +#else + andi I, N, 3 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MADD s1, a1, a3, s1 + MADD s2, a2, a3, s2 + MADD s3, a1, a4, s3 + MADD s4, a2, a4, s4 + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 + ADD $f1, s3, s2 +#else + ADD $f0, s1, s4 + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cgemm_kernel_2x2_lasx.S b/kernel/loongarch64/cgemm_kernel_2x2_lasx.S new file mode 100644 index 000000000..e07f7dc64 --- /dev/null +++ b/kernel/loongarch64/cgemm_kernel_2x2_lasx.S @@ -0,0 +1,857 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LASX vectors */ +#define U0 $xr30 +#define U1 $xr31 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define VALPHAR $xr28 +#define VALPHAI $xr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVNMSUB +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define XVFADD1 XVFADD +#define XVFADD2 XVFADD +#define XVFADD3 XVFSUB +#define XVFADD4 XVFADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVFMADD +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define XVFADD1 XVFADD +#define XVFADD2 XVFADD +#define XVFADD3 XVFADD +#define XVFADD4 XVFSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVFMADD +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define XVFADD1 XVFADD +#define XVFADD2 XVFSUB +#define XVFADD3 XVFADD +#define XVFADD4 XVFADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVNMSUB +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define XVFADD1 XVFADD +#define XVFADD2 XVFSUB +#define XVFADD3 XVFSUB +#define XVFADD4 XVFSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, 
-128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + xvldrepl.w VALPHAR, $sp, 112 + xvldrepl.w VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, 2 + + move J, $r0 + srai.d T0, N, 1 + beq J, T0, .L19 + +.L10: /* for(j=0; j 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x00 + xvld U2, S2, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U2, U1, 0x31 + + xvpermi.d U0, U0, 0xd8 + xvpermi.d U2, U2, 0xd8 + + xvst U0, TD, 0x00 + xvst U2, TD, 0x20 + + addi.d S1, S1, 0x20 // a_offset1 + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + xvld U0, TS, 0x00 + + xvst U0, TD, 0x00 + + addi.d TS, TS, 0x20 // a_offset + addi.d TD, TD, 0x20 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d TS, TS, 0x08 // a_offset + addi.d TD, TD, 0x08 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_ncopy_2_lsx.S b/kernel/loongarch64/cgemm_ncopy_2_lsx.S new file mode 100644 index 000000000..1cf4d87dc --- /dev/null +++ b/kernel/loongarch64/cgemm_ncopy_2_lsx.S @@ -0,0 +1,202 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 +#define D8 $vr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + slli.d T0, TL, 0x01 + + srai.d I, N, 0x01 + beq I, ZERO, .L_N0 + +.L_J1: /* if (i > 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vand.v D0, U2, U2 + vand.v D1, U3, U3 + vand.v D2, U2, U2 + vand.v D3, U3, U3 + + vpermi.w D0, U0, 0x44 + vpermi.w D2, U0, 0xee + vpermi.w D1, U1, 0x44 + vpermi.w D3, U1, 0xee + + vst D0, TD, 0x00 + vst D2, TD, 0x10 + vst D1, TD, 0x20 + vst D3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset1 + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + vld U0, TS, 0x00 + vld U1, TS, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TS, TS, 0x20 // a_offset + addi.d TD, TD, 0x20 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d TS, TS, 0x08 // a_offset + addi.d TD, TD, 0x08 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE 
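For reference, the packing order produced by this ncopy_2 kernel (and its LASX twin above) corresponds to the scalar loop sketched below in plain C; the function name and the interleaved re/im single-precision layout are illustrative assumptions, not part of the patch:

    #include <stddef.h>

    /* Two source lines, lda complex elements apart, are interleaved
       element by element into the packed buffer; an odd trailing line
       is copied through unchanged. */
    static void cgemm_ncopy_2_ref(size_t m, size_t n, const float *a,
                                  size_t lda, float *b)
    {
        size_t i, j;
        for (j = 0; j + 1 < n; j += 2) {
            const float *s1 = a + 2 * j * lda;        /* line j   */
            const float *s2 = a + 2 * (j + 1) * lda;  /* line j+1 */
            for (i = 0; i < m; i++) {
                *b++ = s1[2 * i];      /* re, line j   */
                *b++ = s1[2 * i + 1];  /* im, line j   */
                *b++ = s2[2 * i];      /* re, line j+1 */
                *b++ = s2[2 * i + 1];  /* im, line j+1 */
            }
        }
        if (n & 1) {  /* leftover line: straight copy */
            const float *s1 = a + 2 * (n - 1) * lda;
            for (i = 0; i < 2 * m; i++)
                *b++ = s1[i];
        }
    }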
\ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_2_lasx.S b/kernel/loongarch64/cgemm_tcopy_2_lasx.S new file mode 100644 index 000000000..e2245e412 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_2_lasx.S @@ -0,0 +1,218 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x20 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x00 + xvld U2, S2, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U2, U1, 0x31 + + xvst U0, S8, 0x00 + + slli.d T0, M, 0x04 + add.d S8, S8, T0 + + xvst U2, S8, 0x00 + + add.d S8, S8, T0 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S8, 0x00 + vst $vr1, S8, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + fst.s F2, S9, 0x08 + fst.s F3, S9, 0x0c + + addi.d S9, S9, 0x10 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld $vr0, TS, 0x00 + vld $vr1, TS, 0x10 + + vst $vr0, TD, 0x00 + + slli.d T0, M, 0x04 + add.d TD, TD, T0 + + vst $vr1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld $vr0, TS, 0x00 + + vst $vr0, TD, 0x00 + + addi.d TS, TS, 0x10 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_2_lsx.S b/kernel/loongarch64/cgemm_tcopy_2_lsx.S new file mode 100644 index 000000000..15c0fde8f --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_2_lsx.S @@ -0,0 +1,218 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x20 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S8, 0x00 + vst U2, S8, 0x10 + + slli.d T0, M, 0x04 + add.d S8, S8, T0 + + vst U1, S8, 0x00 + vst U3, S8, 0x10 + + add.d S8, S8, T0 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + 
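+	// odd n: the trailing complex element of each of the two source
+	// lines is routed to the boffset2 area (S9), not the main block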
beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + fst.s F2, S9, 0x08 + fst.s F3, S9, 0x0c + + addi.d S9, S9, 0x10 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, TS, 0x00 + vld U1, TS, 0x10 + + vst U0, TD, 0x00 + + slli.d T0, M, 0x04 + add.d TD, TD, T0 + + vst U1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, TS, 0x00 + + vst U0, TD, 0x00 + + addi.d TS, TS, 0x10 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cnrm2_lasx.S b/kernel/loongarch64/cnrm2_lasx.S new file mode 100644 index 000000000..3a60069ac --- /dev/null +++ b/kernel/loongarch64/cnrm2_lasx.S @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r17
+#define TEMP $r18
+#define t1 $r12
+#define t2 $r13
+#define t3 $r14
+#define t4 $r15
+#define a1 $f15
+#define a2 $f16
+#define res $f19
+#define VX0 $xr15
+#define VX1 $xr16
+#define VX2 $xr17
+#define VX3 $xr18
+#define VX4 $xr21
+#define res1 $xr19
+#define res2 $xr20
+
+	PROLOGUE
+
+#ifdef F_INTERFACE
+	LDINT N, 0(N)
+	LDINT INCX, 0(INCX)
+#endif
+
+	xvxor.v res1, res1, res1
+	xvxor.v res2, res2, res2
+	bge $r0, N, .L999
+	beq $r0, INCX, .L999
+	li.d TEMP, 2 * SIZE
+	slli.d INCX, INCX, ZBASE_SHIFT
+	srai.d I, N, 2
+	bne INCX, TEMP, .L20
+	bge $r0, I, .L997
+	.align 3
+
+.L10:
+	xvld VX0, X, 0 * SIZE
+	xvfcvtl.d.s VX1, VX0
+	xvfcvth.d.s VX2, VX0
+	xvfmadd.d res1, VX1, VX1, res1
+	xvfmadd.d res2, VX2, VX2, res2
+	addi.d I, I, -1
+	addi.d X, X, 8 * SIZE
+	blt $r0, I, .L10
+	.align 3
+	b .L996
+
+.L20:
+	bge $r0, I, .L997
+	.align 3
+
+.L21:
+	ld.w t1, X, 0 * SIZE
+	ld.w t2, X, 1 * SIZE
+	add.d X, X, INCX
+	ld.w t3, X, 0 * SIZE
+	ld.w t4, X, 1 * SIZE
+	add.d X, X, INCX
+	xvinsgr2vr.w VX0, t1, 0
+	xvinsgr2vr.w VX0, t2, 1
+	xvinsgr2vr.w VX0, t3, 2
+	xvinsgr2vr.w VX0, t4, 3
+	ld.w t1, X, 0 * SIZE
+	ld.w t2, X, 1 * SIZE
+	add.d X, X, INCX
+	ld.w t3, X, 0 * SIZE
+	ld.w t4, X, 1 * SIZE
+	xvinsgr2vr.w VX0, t1, 4
+	xvinsgr2vr.w VX0, t2, 5
+	xvinsgr2vr.w VX0, t3, 6
+	xvinsgr2vr.w VX0, t4, 7
+	add.d X, X, INCX
+	xvfcvtl.d.s VX1, VX0
+	xvfcvth.d.s VX2, VX0
+	xvfmadd.d res1, VX1, VX1, res1
+	xvfmadd.d res2, VX2, VX2, res2
+	addi.d I, I, -1
+	blt $r0, I, .L21
+	b .L996
+
+.L996:
+	xvfadd.d res1, res1, res2
+	xvpickve.d VX1, res1, 1
+	xvpickve.d VX2, res1, 2
+	xvpickve.d VX3, res1, 3
+	xvfadd.d res1, VX1, res1
+	xvfadd.d res1, VX2, res1
+	xvfadd.d res1, VX3, res1
+	.align 3
+
+.L997:
+	andi I, N, 3
+	bge $r0, I, .L999
+	.align 3
+
+.L998:
+	fld.s a1, X, 0 * SIZE
+	fld.s a2, X, 1 * SIZE
+	addi.d I, I, -1
+	fcvt.d.s a1, a1
+	fcvt.d.s a2, a2
+	fmadd.d res, a1, a1, res
+	fmadd.d res, a2, a2, res
+	add.d X, X, INCX
+	blt $r0, I, .L998
+	.align 3
+
+.L999:
+	fsqrt.d res, res
+	move $r4, $r17
+	fcvt.s.d $f0, res
+	jirl $r0, $r1, 0x0
+
+	EPILOGUE
diff --git a/kernel/loongarch64/cnrm2_lsx.S b/kernel/loongarch64/cnrm2_lsx.S
new file mode 100644
index 000000000..20950ba17
--- /dev/null
+++ b/kernel/loongarch64/cnrm2_lsx.S
@@ -0,0 +1,155 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define a1 $f15 +#define a2 $f16 +#define res $f19 +#define VX0 $vr15 +#define VX1 $vr16 +#define VX2 $vr17 +#define VX3 $vr18 +#define VX4 $vr21 +#define res1 $vr19 +#define res2 $vr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + vld VX0, X, 4 * SIZE + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L996 + .align 3 + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s a1, X, 0 * SIZE + fld.s a2, X, 1 * SIZE + addi.d I, I, -1 + fcvt.d.s a1, a1 + fcvt.d.s a2, a2 + fmadd.d res, a1, a1, res + fmadd.d res, a2, a2, res + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d res, res + move $r4, $r17 + fcvt.s.d $f0, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/copy_lasx.S b/kernel/loongarch64/copy_lasx.S new file mode 100644 index 000000000..31f91cec1 --- /dev/null +++ b/kernel/loongarch64/copy_lasx.S @@ -0,0 +1,306 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 + addi.d I, I, -1 + xvst VX0, Y, 0 +#ifdef DOUBLE + xvld VX0, X, 32 + xvst VX0, Y, 32 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 3 + add.d Y, Y, INCY +#else + xvld VX0, X, 0 + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 7 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* 
INCX!=1 and INCY==1 */
+.L21:
+	bge $r0, I, .L212
+	.align 3
+
+.L211:
+#ifdef DOUBLE
+	ld.d t1, X, 0
+	add.d X, X, INCX
+	ld.d t2, X, 0
+	add.d X, X, INCX
+	ld.d t3, X, 0
+	add.d X, X, INCX
+	ld.d t4, X, 0
+	add.d X, X, INCX
+	xvinsgr2vr.d VX0, t1, 0
+	xvinsgr2vr.d VX0, t2, 1
+	xvinsgr2vr.d VX0, t3, 2
+	xvinsgr2vr.d VX0, t4, 3
+	xvst VX0, Y, 0
+	ld.d t1, X, 0
+	add.d X, X, INCX
+	ld.d t2, X, 0
+	add.d X, X, INCX
+	ld.d t3, X, 0
+	add.d X, X, INCX
+	ld.d t4, X, 0
+	add.d X, X, INCX
+	xvinsgr2vr.d VX1, t1, 0
+	xvinsgr2vr.d VX1, t2, 1
+	xvinsgr2vr.d VX1, t3, 2
+	xvinsgr2vr.d VX1, t4, 3
+	xvst VX1, Y, 32
+#else
+	ld.w t1, X, 0
+	add.d X, X, INCX
+	ld.w t2, X, 0
+	add.d X, X, INCX
+	ld.w t3, X, 0
+	add.d X, X, INCX
+	ld.w t4, X, 0
+	add.d X, X, INCX
+	xvinsgr2vr.w VX0, t1, 0
+	xvinsgr2vr.w VX0, t2, 1
+	xvinsgr2vr.w VX0, t3, 2
+	xvinsgr2vr.w VX0, t4, 3
+	ld.w t1, X, 0
+	add.d X, X, INCX
+	ld.w t2, X, 0
+	add.d X, X, INCX
+	ld.w t3, X, 0
+	add.d X, X, INCX
+	ld.w t4, X, 0
+	add.d X, X, INCX
+	xvinsgr2vr.w VX0, t1, 4
+	xvinsgr2vr.w VX0, t2, 5
+	xvinsgr2vr.w VX0, t3, 6
+	xvinsgr2vr.w VX0, t4, 7
+	xvst VX0, Y, 0
+#endif
+	addi.d Y, Y, 8 * SIZE
+	addi.d I, I, -1
+	blt $r0, I, .L211
+	.align 3
+
+.L212:
+	andi I, N, 7
+	bge $r0, I, .L999
+	.align 3
+
+.L213:
+	LD $f12, X, 0
+	addi.d I, I, -1
+	ST $f12, Y, 0
+	add.d X, X, INCX
+	addi.d Y, Y, SIZE
+	blt $r0, I, .L213
+	b .L999
+	.align 3
+
+/* INCX!=1 and INCY!=1 */
+.L22:
+	bge $r0, I, .L223
+	.align 3
+
+.L222:
+	LD a1, X, 0
+	add.d X, X, INCX
+	LD a2, X, 0
+	add.d X, X, INCX
+	LD a3, X, 0
+	add.d X, X, INCX
+	LD a4, X, 0
+	add.d X, X, INCX
+	ST a1, Y, 0
+	add.d Y, Y, INCY
+	ST a2, Y, 0
+	add.d Y, Y, INCY
+	ST a3, Y, 0
+	add.d Y, Y, INCY
+	ST a4, Y, 0
+	add.d Y, Y, INCY
+	LD a1, X, 0
+	add.d X, X, INCX
+	LD a2, X, 0
+	add.d X, X, INCX
+	LD a3, X, 0
+	add.d X, X, INCX
+	LD a4, X, 0
+	add.d X, X, INCX
+	ST a1, Y, 0
+	add.d Y, Y, INCY
+	ST a2, Y, 0
+	add.d Y, Y, INCY
+	ST a3, Y, 0
+	add.d Y, Y, INCY
+	ST a4, Y, 0
+	add.d Y, Y, INCY
+	addi.d I, I, -1
+	blt $r0, I, .L222
+	.align 3
+
+.L223:
+	andi I, N, 7
+	bge $r0, I, .L999
+	.align 3
+
+.L224:
+	LD $f12, X, 0
+	addi.d I, I, -1
+	ST $f12, Y, 0
+	add.d X, X, INCX
+	add.d Y, Y, INCY
+	blt $r0, I, .L224
+	.align 3
+
+.L999:
+	move $r4, $r0
+	jirl $r0, $r1, 0x0
+	.align 3
+
+	EPILOGUE
diff --git a/kernel/loongarch64/copy_lsx.S b/kernel/loongarch64/copy_lsx.S
new file mode 100644
index 000000000..bb10f3565
--- /dev/null
+++ b/kernel/loongarch64/copy_lsx.S
@@ -0,0 +1,316 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + vst VX0, Y, 0 + vst VX1, Y, 16 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + vst VX0, Y, 32 + vst VX1, Y, 48 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + vld VX0, X, 32 + vld VX1, X, 48 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY +#else + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 3 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 
1
+	vst VX0, Y, 0
+	vst VX1, Y, 16
+	ld.d t1, X, 0
+	add.d X, X, INCX
+	ld.d t2, X, 0
+	add.d X, X, INCX
+	ld.d t3, X, 0
+	add.d X, X, INCX
+	ld.d t4, X, 0
+	add.d X, X, INCX
+	vinsgr2vr.d VX0, t1, 0
+	vinsgr2vr.d VX0, t2, 1
+	vinsgr2vr.d VX1, t3, 0
+	vinsgr2vr.d VX1, t4, 1
+	vst VX0, Y, 32
+	vst VX1, Y, 48
+#else
+	ld.w t1, X, 0
+	add.d X, X, INCX
+	ld.w t2, X, 0
+	add.d X, X, INCX
+	ld.w t3, X, 0
+	add.d X, X, INCX
+	ld.w t4, X, 0
+	add.d X, X, INCX
+	vinsgr2vr.w VX0, t1, 0
+	vinsgr2vr.w VX0, t2, 1
+	vinsgr2vr.w VX0, t3, 2
+	vinsgr2vr.w VX0, t4, 3
+	vst VX0, Y, 0
+	ld.w t1, X, 0
+	add.d X, X, INCX
+	ld.w t2, X, 0
+	add.d X, X, INCX
+	ld.w t3, X, 0
+	add.d X, X, INCX
+	ld.w t4, X, 0
+	add.d X, X, INCX
+	vinsgr2vr.w VX1, t1, 0
+	vinsgr2vr.w VX1, t2, 1
+	vinsgr2vr.w VX1, t3, 2
+	vinsgr2vr.w VX1, t4, 3
+	vst VX1, Y, 16
+#endif
+	addi.d Y, Y, 8 * SIZE
+	addi.d I, I, -1
+	blt $r0, I, .L211
+	.align 3
+
+.L212:
+	andi I, N, 7
+	bge $r0, I, .L999
+	.align 3
+
+.L213:
+	LD $f12, X, 0
+	addi.d I, I, -1
+	ST $f12, Y, 0
+	add.d X, X, INCX
+	addi.d Y, Y, SIZE
+	blt $r0, I, .L213
+	b .L999
+	.align 3
+
+/* INCX!=1 and INCY!=1 */
+.L22:
+	bge $r0, I, .L223
+	.align 3
+
+.L222:
+	LD a1, X, 0
+	add.d X, X, INCX
+	LD a2, X, 0
+	add.d X, X, INCX
+	LD a3, X, 0
+	add.d X, X, INCX
+	LD a4, X, 0
+	add.d X, X, INCX
+	ST a1, Y, 0
+	add.d Y, Y, INCY
+	ST a2, Y, 0
+	add.d Y, Y, INCY
+	ST a3, Y, 0
+	add.d Y, Y, INCY
+	ST a4, Y, 0
+	add.d Y, Y, INCY
+	LD a1, X, 0
+	add.d X, X, INCX
+	LD a2, X, 0
+	add.d X, X, INCX
+	LD a3, X, 0
+	add.d X, X, INCX
+	LD a4, X, 0
+	add.d X, X, INCX
+	ST a1, Y, 0
+	add.d Y, Y, INCY
+	ST a2, Y, 0
+	add.d Y, Y, INCY
+	ST a3, Y, 0
+	add.d Y, Y, INCY
+	ST a4, Y, 0
+	add.d Y, Y, INCY
+	addi.d I, I, -1
+	blt $r0, I, .L222
+	.align 3
+
+.L223:
+	andi I, N, 7
+	bge $r0, I, .L999
+	.align 3
+
+.L224:
+	LD $f12, X, 0
+	addi.d I, I, -1
+	ST $f12, Y, 0
+	add.d X, X, INCX
+	add.d Y, Y, INCY
+	blt $r0, I, .L224
+	.align 3
+
+.L999:
+	move $r4, $r0
+	jirl $r0, $r1, 0x0
+	.align 3
+
+	EPILOGUE
diff --git a/kernel/loongarch64/crot_lasx.S b/kernel/loongarch64/crot_lasx.S
new file mode 100644
index 000000000..d4ec1e22c
--- /dev/null
+++ b/kernel/loongarch64/crot_lasx.S
@@ -0,0 +1,1079 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VT0 $xr10 +#define VT1 $xr18 +#define VXC $xr23 +#define VXS $xr9 +#define VXZ $xr11 +#define x1 $xr12 +#define x2 $xr13 +#define x3 $xr14 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, C + MTG t2, S + MTG t3, a1 +#ifdef DOUBLE + xvreplgr2vr.d VXC, t1 + xvreplgr2vr.d VXS, t2 + xvreplgr2vr.d VXZ, t3 + srai.d I, N, 2 +#else + xvreplgr2vr.w VXC, t1 + xvreplgr2vr.w VXS, t2 + xvreplgr2vr.w VXZ, t3 + srai.d I, N, 3 +#endif + beq INCX, $r0, .L996 + beq INCY, $r0, .L996 + bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1 + bne INCY, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x1, VXC + xvfmadd.d VX0, x3, VXS, VX0 + xvfmul.d VX1, x1, VXS + xvfmsub.d VX1, x3, VXC, VX1 + xvfmul.d VX2, x2, VXC + xvfmadd.d VX2, x4, VXS, VX2 + xvfmul.d VX3, x2, VXS + xvfmsub.d VX3, x4, VXC, VX3 + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x1, VXC + xvfmadd.s VX0, x3, VXS, VX0 + xvfmul.s VX1, x1, VXS + xvfmsub.s VX1, x3, VXC, VX1 + xvfmul.s VX2, x2, VXC + xvfmadd.s VX2, x4, VXS, VX2 + xvfmul.s VX3, x2, VXS + xvfmsub.s VX3, x4, VXC, VX3 + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE 
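+	// S == 0 in this branch, so the rotation degenerates to scaling
+	// both vectors by C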
+ xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x1, VXC + xvfmul.d VX1, x3, VXC + xvfmul.d VX2, x2, VXC + xvfmul.d VX3, x4, VXC + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x1, VXC + xvfmul.s VX1, x3, VXC + xvfmul.s VX2, x2, VXC + xvfmul.s VX3, x4, VXC + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x3, VXS + xvfmul.d VX1, x1, VXS + xvfsub.d VX1, VXZ, VX1 + xvfmul.d VX2, x4, VXS + xvfmul.d VX3, x2, VXS + xvfsub.d VX3, VXZ, VX3 + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x3, VXS + xvfmul.s VX1, x1, VXS + xvfsub.s VX1, VXZ, VX1 + xvfmul.s VX2, x4, VXS + xvfmul.s VX3, x2, VXS + xvfsub.s VX3, VXZ, VX3 + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + xvst VXZ, X, 0 * SIZE + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 4 * SIZE + xvst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvst VXZ, X, 8 * SIZE + xvst VXZ, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + 
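+	// x1/x2 collect the real/imaginary parts of four strided X elements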
xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + + xvfmul.d VX0, x1, VXC + xvfmadd.d VX0, x3, VXS, VX0 + xvfmul.d VX1, x1, VXS + xvfmsub.d VX1, x3, VXC, VX1 + xvfmul.d VX2, x2, VXC + xvfmadd.d VX2, x4, VXS, VX2 + xvfmul.d VX3, x2, VXS + xvfmsub.d VX3, x4, VXC, VX3 + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + + xvfmul.s VX0, x1, VXC + xvfmadd.s VX0, x3, VXS, VX0 + xvfmul.s VX1, x1, VXS + xvfmsub.s VX1, x3, VXC, VX1 + xvfmul.s VX2, x2, VXC + xvfmadd.s VX2, x4, VXS, VX2 + xvfmul.s VX3, x2, VXS + xvfmsub.s VX3, x4, VXC, VX3 + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + 
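+	// VX1/VX3 hold the rotated Y values; each element is scattered back
+	// as an adjacent re/im pair through INCY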
xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, x1, VXC + xvfmul.d VX1, x3, VXC + xvfmul.d VX2, x2, VXC + xvfmul.d VX3, x4, VXC + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 
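+	// x3/x4 gather the real/imaginary parts of the strided Y elements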
+ xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, x1, VXC + xvfmul.s VX1, x3, VXC + xvfmul.s VX2, x2, VXC + xvfmul.s VX3, x4, VXC + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, x3, VXS + xvfmul.d VX1, x1, VXS + xvfsub.d VX1, VXZ, VX1 + xvfmul.d VX2, x4, VXS + xvfmul.d VX3, x2, VXS + xvfsub.d VX3, VXZ, VX3 + xvstelm.d VX0, XX, 0, 0 + xvstelm.d 
VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, x3, VXS + xvfmul.s VX1, x1, VXS + xvfsub.s VX1, VXZ, VX1 + xvfmul.s VX2, x4, VXS + xvfmul.s VX3, x2, VXS + xvfsub.s VX3, VXZ, VX3 + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, 
YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 +#else + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L997 + .align 3 + +.L996: + move I, N + b .L998 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MUL s1, a1, C + MADD s1, a3, S, s1 + MUL s2, a1, S + MSUB s2, a3, C, s2 + MUL s3, a2, C + MADD s3, a4, S, s3 + MUL s4, a2, S + MSUB s4, a4, C, s4 + addi.d I, I, -1 + ST s1, X, 0 * SIZE + ST s2, Y, 0 * SIZE + ST s3, X, 1 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/crot_lsx.S b/kernel/loongarch64/crot_lsx.S new file mode 100644 index 000000000..126257edc --- /dev/null +++ b/kernel/loongarch64/crot_lsx.S @@ -0,0 +1,907 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define Y $r7
+#define INCY $r8
+#define C $f0
+#define S $f1
+
+#define I $r12
+#define TEMP $r13
+#define t1 $r14
+#define t2 $r16
+#define t3 $r15
+#define t4 $r17
+#define XX $r18
+#define YY $r19
+#define a1 $f12
+#define a2 $f13
+#define a3 $f14
+#define a4 $f15
+#define s1 $f16
+#define s2 $f17
+#define s3 $f18
+#define s4 $f19
+#define VX0 $vr8
+#define VX1 $vr20
+#define VX2 $vr21
+#define VX3 $vr22
+#define VT0 $vr10
+#define VT1 $vr18
+#define VXC $vr23
+#define VXS $vr9
+#define VXZ $vr11
+#define x1 $vr12
+#define x2 $vr13
+#define x3 $vr14
+#define x4 $vr15
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    FFINT a1, a1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    slli.d INCY, INCY, ZBASE_SHIFT
+    MTG t1, C
+    MTG t2, S
+    MTG t3, a1
+#ifdef DOUBLE
+    vreplgr2vr.d VXC, t1
+    vreplgr2vr.d VXS, t2
+    vreplgr2vr.d VXZ, t3
+    srai.d I, N, 1 // unit-stride vector loop handles 2 complex doubles per iteration
+#else
+    vreplgr2vr.w VXC, t1
+    vreplgr2vr.w VXS, t2
+    vreplgr2vr.w VXZ, t3
+    srai.d I, N, 2
+#endif
+    beq INCX, $r0, .L996
+    beq INCY, $r0, .L996
+    bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1
+    bne INCY, TEMP, .L22
+
+.L11:
+    bge $r0, I, .L997
+    CMPEQ $fcc0, C, a1
+    bcnez $fcc0, .L110
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L112 // C!=0 S==0
+    b .L111 // C!=0 S!=0
+    .align 3
+
+.L110:
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L114 // C==0 S==0
+    b .L113 // C==0 S!=0
+    .align 3
+
+.L111: // C!=0 S!=0
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+    vfmul.d VX0, x1, VXC
+    vfmadd.d VX0, x3, VXS, VX0
+    vfmul.d VX1, x1, VXS
+    vfmsub.d VX1, x3, VXC, VX1
+    vfmul.d VX2, x2, VXC
+    vfmadd.d VX2, x4, VXS, VX2
+    vfmul.d VX3, x2, VXS
+    vfmsub.d VX3, x4, VXC, VX3
+    vilvl.d x1, VX2, VX0
+    vilvh.d x2, VX2, VX0
+    vilvl.d x3, VX3, VX1
+    vilvh.d x4, VX3, VX1
+    vst x1, X, 0 * SIZE
+    vst x3, Y, 0 * SIZE
+    vst x2, X, 2 * SIZE
+    vst x4, Y, 2 * SIZE
+    addi.d X, X, 4 * SIZE
+    addi.d Y, Y, 4 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+
vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x1, VXC + vfmadd.s VX0, x3, VXS, VX0 + vfmul.s VX1, x1, VXS + vfmsub.s VX1, x3, VXC, VX1 + vfmul.s VX2, x2, VXC + vfmadd.s VX2, x4, VXS, VX2 + vfmul.s VX3, x2, VXS + vfmsub.s VX3, x4, VXC, VX3 + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x1, VXC + vfmul.s VX1, x3, VXC + vfmul.s VX2, x2, VXC + vfmul.s VX3, x4, VXC + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x3, VXS + vfmul.s VX1, x1, VXS + vfsub.s VX1, VXZ, VX1 + vfmul.s VX2, x4, VXS + vfmul.s VX3, x2, VXS + vfsub.s VX3, VXZ, VX3 + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vst VXZ, X, 4 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: +#ifdef DOUBLE + srai.d I, N, 2 +#endif + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + 
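The four strided branches that follow (.L221 through .L224) are lane-by-lane unrollings of the same per-element update that the .L998 scalar tail performs. As a reading aid, here is a minimal C sketch of that update; it is an illustration only, not part of the patch, and the names crot_ref, n, incx and incy are assumptions (the FLOAT typedef below stands in for OpenBLAS's element type). With real c and s, each complex pair is rotated as x' = c*x + s*y and y' = c*y - s*x, with real and imaginary parts handled independently:

    typedef float FLOAT;  /* stands in for OpenBLAS's FLOAT (float or double) */

    /* Reference sketch of the CROT update; mirrors the .L998 scalar tail. */
    static void crot_ref(long n, FLOAT *x, long incx, FLOAT *y, long incy,
                         FLOAT c, FLOAT s)
    {
        for (long i = 0; i < n; i++) {
            FLOAT xr = x[0], xi = x[1];   /* one complex element of X */
            FLOAT yr = y[0], yi = y[1];   /* one complex element of Y */
            x[0] = c * xr + s * yr;       /* the MUL/MADD pairs in .L998 */
            x[1] = c * xi + s * yi;
            y[0] = c * yr - s * xr;       /* the MUL/MSUB pairs in .L998 */
            y[1] = c * yi - s * xi;
            x += 2 * incx;                /* increments are per complex element */
            y += 2 * incy;
        }
    }

The special-cased branches fall out of this directly: S == 0 reduces to plain scaling by C, C == 0 to a scaled swap with negation, and C == S == 0 to storing zeros.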
+.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + vfmul.s VX0, x1, VXC + vfmadd.s VX0, x3, VXS, VX0 + vfmul.s VX1, x1, VXS + vfmsub.s VX1, x3, VXC, VX1 + vfmul.s VX2, x2, VXC + vfmadd.s VX2, x4, VXS, VX2 + vfmul.s VX3, x2, VXS + vfmsub.s VX3, x4, VXC, VX3 + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w 
VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 +#endif + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, x1, VXC + vfmul.s VX1, x3, VXC + vfmul.s VX2, x2, VXC + vfmul.s VX3, x4, VXC + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, 
YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 +#endif + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, x3, VXS + vfmul.s VX1, x1, VXS + vfsub.s VX1, VXZ, VX1 + vfmul.s VX2, x4, VXS + vfmul.s VX3, x2, VXS + vfsub.s VX3, VXZ, VX3 + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + 
add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 +#endif + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L995 +#else + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L997 +#endif + .align 3 + +#ifdef DOUBLE + .L995: + andi I, N, 3 + bge $r0, I, .L999 + b .L998 + .align 3 + +#endif +.L996: + move I, N + b .L998 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 1 +#else + andi I, N, 3 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MUL s1, a1, C + MADD s1, a3, S, s1 + MUL s2, a1, S + MSUB s2, a3, C, s2 + MUL s3, a2, C + MADD s3, a4, S, s3 + MUL s4, a2, S + MSUB s4, a4, C, s4 + addi.d I, I, -1 + ST s1, X, 0 * SIZE + ST s2, Y, 0 * SIZE + ST s3, X, 1 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S new file mode 100644 index 000000000..3605a6c0e --- /dev/null +++ b/kernel/loongarch64/cscal_lasx.S @@ -0,0 +1,645 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define VXZ $xr12 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + MTG t1, ALPHAR +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + xvreplgr2vr.d VXAI, t2 + xvxor.v VXZ, VXZ, VXZ + srai.d I, N, 2 +#else + xvreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + xvreplgr2vr.w VXAI, t2 + xvxor.v VXZ, VXZ, VXZ + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 + b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + +.L14: + bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + b .L111 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L111: //alpha_r == 0.0 && alpha_i == 0.0 + xvst VXZ, X, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvst VXZ, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: //alpha_r == 0.0 && alpha_i != 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VXAI, x2 + xvfsub.d x3, VXZ, x3 + xvfmul.d x4, VXAI, x1 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VXAI, x2 + xvfsub.s x3, VXZ, x3 + xvfmul.s x4, VXAI, x1 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: //alpha_r != 0.0 && alpha_i == 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VXAR, x1 + xvfmul.d x4, VXAR, x2 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld 
VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VXAR, x1 + xvfmul.s x4, VXAR, x2 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: //alpha_r != 0.0 && alpha_i != 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d VX0, VXAI, x2 + xvfmsub.d x3, VXAR, x1, VX0 + xvfmul.d VX1, VXAI, x1 + xvfmadd.d x4, VXAR, x2, VX1 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s VX0, VXAI, x2 + xvfmsub.s x3, VXAR, x1, VX0 + xvfmul.s VX1, VXAI, x1 + xvfmadd.s x4, VXAR, x2, VX1 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 + b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + +.L24: + bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + b .L221 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L221: //alpha_r == 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 +#else + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 +#endif + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: //alpha_r == 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d x3, VXAI, x2 + xvfsub.d x3, VXZ, x3 + xvfmul.d x4, VXAI, x1 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 
+ xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s x3, VXAI, x2 + xvfsub.s x3, VXZ, x3 + xvfmul.s x4, VXAI, x1 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: //alpha_r != 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d x3, VXAR, x1 + xvfmul.d x4, VXAR, x2 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s x3, VXAR, x1 + xvfmul.s x4, VXAR, x2 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + 
add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: //alpha_r != 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d VX0, VXAI, x2 + xvfmsub.d x3, VXAR, x1, VX0 + xvfmul.d VX1, VXAI, x1 + xvfmadd.d x4, VXAR, x2, VX1 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s VX0, VXAI, x2 + xvfmsub.s x3, VXAR, x1, VX0 + xvfmul.s VX1, VXAI, x1 + xvfmadd.s x4, VXAR, x2, VX1 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD 
a2, X, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s1, ALPHAR, a1, s1 + MADD s2, ALPHAR, a2, s2 + ST s1, X, 0 * SIZE + ST s2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cscal_lsx.S b/kernel/loongarch64/cscal_lsx.S new file mode 100644 index 000000000..f442a754f --- /dev/null +++ b/kernel/loongarch64/cscal_lsx.S @@ -0,0 +1,571 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXAR $vr23 +#define VXAI $vr19 +#define VXZ $vr12 +#define x1 $vr18 +#define x2 $vr17 +#define x3 $vr16 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + MTG t1, ALPHAR +#ifdef DOUBLE + vreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + vreplgr2vr.d VXAI, t2 +#else + vreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + vreplgr2vr.w VXAI, t2 +#endif + vxor.v VXZ, VXZ, VXZ + srai.d I, N, 2 + bne INCX, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 + b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + +.L14: + bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + b .L111 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L111: //alpha_r == 0.0 && alpha_i == 0.0 + vst VXZ, X, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, X, 4 * SIZE + vst VXZ, X, 6 * SIZE +#else + vst VXZ, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: //alpha_r == 0.0 && alpha_i != 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXAI, x2 + vfsub.s x3, VXZ, x3 + vfmul.s x4, VXAI, x1 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: //alpha_r != 0.0 && alpha_i == 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXAR, x1 + vfmul.s x4, VXAR, x2 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: //alpha_r != 0.0 && alpha_i != 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 
2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s VX0, VXAI, x2 + vfmsub.s x3, VXAR, x1, VX0 + vfmul.s VX1, VXAI, x1 + vfmadd.s x4, VXAR, x2, VX1 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 + b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + +.L24: + bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + b .L221 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L221: //alpha_r == 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 +#else + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 +#endif + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: //alpha_r == 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s x3, VXAI, x2 + vfsub.s x3, VXZ, x3 + vfmul.s x4, VXAI, x1 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w 
x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: //alpha_r != 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s x3, VXAR, x1 + vfmul.s x4, VXAR, x2 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: //alpha_r != 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * 
SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s VX0, VXAI, x2 + vfmsub.s x3, VXAR, x1, VX0 + vfmul.s VX1, VXAI, x1 + vfmadd.s x4, VXAR, x2, VX1 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s1, ALPHAR, a1, s1 + MADD s2, ALPHAR, a2, s2 + ST s1, X, 0 * SIZE + ST s2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/csum_lasx.S b/kernel/loongarch64/csum_lasx.S new file mode 100644 index 000000000..3e65f2c15 --- /dev/null +++ b/kernel/loongarch64/csum_lasx.S @@ -0,0 +1,274 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvfadd.d res2, VX2, VX3 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res1, 4 + xvpickve.w VX1, res1, 5 + xvpickve.w VX2, res1, 6 + xvpickve.w VX3, res1, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 +
xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res1, 4 + xvpickve.w VX1, res1, 5 + xvpickve.w VX2, res1, 6 + xvpickve.w VX3, res1, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: +#ifdef DOUBLE + fmov.d $f0, $f16 +#else + fmov.s $f0, $f16 +#endif + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/csum_lsx.S b/kernel/loongarch64/csum_lsx.S new file mode 100644 index 000000000..8de8e27ca --- /dev/null +++ b/kernel/loongarch64/csum_lsx.S @@ -0,0 +1,266 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfadd.s res2, VX0, VX1 + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + 
vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfadd.s res2, VX0, VX1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: +#ifdef DOUBLE + fmov.d $f0, $f16 +#else + fmov.s $f0, $f16 +#endif + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cswap_lasx.S b/kernel/loongarch64/cswap_lasx.S new file mode 100644 index 000000000..d53773d5a --- /dev/null +++ b/kernel/loongarch64/cswap_lasx.S @@ -0,0 +1,394 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvst VX2, X, 0 * SIZE + xvst VX0, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + xvstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 2 + ld.d t4, Y, 1 * SIZE + xvstelm.d VX0, Y, 1 * SIZE, 3 + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + xvstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + xvstelm.d VX1, Y, 0 * SIZE, 2 + ld.d t4, Y, 1 * SIZE + xvstelm.d VX1, Y, 1 * SIZE, 3 + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvst VX3, X, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 3 + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 4 + ld.w t2, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 6 + ld.w t4, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 7 + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L123: + LD 
a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + xvstelm.d VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX2, X, 0 * SIZE, 2 + ld.d t4, X, 1 * SIZE + xvstelm.d VX2, X, 1 * SIZE, 3 + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvst VX0, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX3, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + xvstelm.d VX3, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX3, X, 0 * SIZE, 2 + ld.d t4, X, 1 * SIZE + xvstelm.d VX3, X, 1 * SIZE, 3 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvst VX1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 3 + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 4 + ld.w t2, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 5 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 6 + ld.w t4, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 7 + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvst VX0, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + ST b1, XX, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, X, 0 * SIZE + ST b3, XX, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + ST b1, XX, 0 * SIZE + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + ST b3, XX, 0 * SIZE + ST b4, XX, 1 * SIZE + + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + 
ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cswap_lsx.S b/kernel/loongarch64/cswap_lsx.S new file mode 100644 index 000000000..62a869066 --- /dev/null +++ b/kernel/loongarch64/cswap_lsx.S @@ -0,0 +1,421 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + vstelm.d VX0, Y, 1 * SIZE, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t4, Y, 1 * SIZE + vstelm.d VX1, Y, 1 * SIZE, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + vstelm.d VX0, Y, 1 * SIZE, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t4, Y, 1 * SIZE + vstelm.d VX1, Y, 1 * SIZE, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + vstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + vstelm.w VX0, Y, 1 * SIZE, 3 + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX1, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + vstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w 
VX1, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + vstelm.w VX1, Y, 1 * SIZE, 3 + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + vstelm.d VX2, X, 1 * SIZE, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0 * SIZE, 0 + ld.d t4, X, 1 * SIZE + vstelm.d VX3, X, 1 * SIZE, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + vstelm.d VX2, X, 1 * SIZE, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0 * SIZE, 0 + ld.d t4, X, 1 * SIZE + vstelm.d VX3, X, 1 * SIZE, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX2, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + vstelm.w VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + vstelm.w VX2, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + vstelm.w VX2, X, 1 * SIZE, 3 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vst VX0, Y, 0 * SIZE + + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX3, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + vstelm.w VX3, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + vstelm.w VX3, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + vstelm.w VX3, X, 1 * SIZE, 3 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vst VX1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + ST b1, XX, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, X, 0 * SIZE + ST b3, XX, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST 
a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + ST b1, XX, 0 * SIZE + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + ST b3, XX, 0 * SIZE + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_kernel_8x4.S b/kernel/loongarch64/dgemm_kernel_8x4.S new file mode 100644 index 000000000..405f1bd97 --- /dev/null +++ b/kernel/loongarch64/dgemm_kernel_8x4.S @@ -0,0 +1,2894 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA $f0 // param 4: alpha +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc + +#ifdef TRMMKERNEL +#define OFFSET $r11 // param 9: offset +#endif +#define OFF $r12 + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 /* !! DO NOT USE $r21 and $r22 !! 
*/ +#define T1 $r26 +#define T2 $r27 +#define ZERO $r0 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define D0 $vr16 +#define D1 $vr17 +#define D2 $vr18 +#define D3 $vr19 +#define D4 $vr20 +#define D5 $vr21 +#define D6 $vr22 +#define D7 $vr23 +#define D8 $vr24 +#define D9 $vr25 +#define D10 $vr26 +#define D11 $vr27 +#define D12 $vr28 +#define D13 $vr29 +#define D14 $vr30 +#define D15 $vr31 +#define VALPHA $vr15 + +/* Prefetch interval */ +#define A_PRE 0x200 +#define B_PRE 0x100 + +.macro KERNEL2x8x4 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + vldrepl.d U5, B0, 0x08 + vfmadd.d D10, U10, U14, D10 + vfmadd.d D11, U11, U14, D11 + + preld 0, A0, A_PRE + vldrepl.d U6, B0, 0x10 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + vldrepl.d U7, B0, 0x18 + vfmadd.d D14, U10, U15, D14 + vfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + vld U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vld U9, A0, 0x10 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vld U10, A0, 0x20 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vld U11, A0, 0x30 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U12, B0, 0x00 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + vldrepl.d U13, B0, 0x08 + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + vldrepl.d U14, B0, 0x10 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + vldrepl.d U15, B0, 0x18 + vfmadd.d D14, U2, U7, D14 + vfmadd.d D15, U3, U7, D15 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x8x4_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + vldrepl.d U5, B0, 0x08 + vfmadd.d D10, U10, U14, D10 + vfmadd.d D11, U11, U14, D11 + + preld 0, A0, A_PRE + vldrepl.d U6, B0, 0x10 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + vldrepl.d U7, B0, 0x18 + vfmadd.d D14, U10, U15, D14 + vfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + vfmadd.d D14, U2, U7, D14 + vfmadd.d 
D15, U3, U7, D15 +.endm + +.macro KERNEL8x8x4 +.rept 4 + KERNEL2x8x4 +.endr +.endm + +.macro KERNEL8x8x4_END +.rept 3 + KERNEL2x8x4 +.endr + KERNEL2x8x4_END +.endm + +.macro KERNEL2x4x4 + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + vldrepl.d U12, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U13, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U14, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + vldrepl.d U15, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x4_END + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 +.endm + +.macro KERNEL8x4x4 +.rept 4 + KERNEL2x4x4 +.endr +.endm + +.macro KERNEL8x4x4_END +.rept 3 + KERNEL2x4x4 +.endr + KERNEL2x4x4_END +.endm + +.macro KERNEL2x2x4 + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vfmadd.d D2, U9, U12, D2 + vfmadd.d D3, U9, U13, D3 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + vldrepl.d U8, A0, 0x00 + vldrepl.d U9, A0, 0x08 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 + + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x2x4_END + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vfmadd.d D2, U9, U12, D2 + vfmadd.d D3, U9, U13, D3 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 +.endm + +.macro KERNEL8x2x4 +.rept 4 + KERNEL2x2x4 +.endr +.endm + +.macro KERNEL8x2x4_END +.rept 3 + KERNEL2x2x4 +.endr + KERNEL2x2x4_END +.endm + +.macro KERNEL2x1x4 + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vldrepl.d U8, A0, 0x08 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vld U12, B0, 0x20 + vld U13, B0, 0x30 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x40 +.endm + +.macro KERNEL2x1x4_END + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 +.endm + +.macro KERNEL8x1x4 +.rept 4 + KERNEL2x1x4 +.endr +.endm + +.macro KERNEL8x1x4_END +.rept 3 + KERNEL2x1x4 +.endr + KERNEL2x1x4_END +.endm 
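+/* Note on the KERNEL2x*x* macros: each body consists of two
+ * software-pipelined steps. The vfmadd.d chain consumes the A/B values
+ * fetched in the previous step (in the 8-wide case U8-U11 from A and
+ * U12-U15 from B) while the loads for the next step are already being
+ * issued, hiding load latency. The *_END variants perform the same
+ * arithmetic but omit the trailing loads, which is why each unrolled
+ * loop must finish with its _END form. */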
+ +.macro KERNEL2x8x2 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + vld U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vld U9, A0, 0x10 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vld U10, A0, 0x20 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vld U11, A0, 0x30 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x8x2_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 +.endm + +.macro KERNEL8x8x2 +.rept 4 + KERNEL2x8x2 +.endr +.endm + +.macro KERNEL8x8x2_END +.rept 3 + KERNEL2x8x2 +.endr + KERNEL2x8x2_END +.endm + +.macro KERNEL2x4x2 + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + vld U8, A0, 0x20 + vld U9, A0, 0x30 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U12, B0, 0x10 + vldrepl.d U13, B0, 0x18 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x2_END + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 +.endm + +.macro KERNEL8x4x2 +.rept 4 + KERNEL2x4x2 +.endr +.endm + +.macro KERNEL8x4x2_END +.rept 3 + KERNEL2x4x2 +.endr + KERNEL2x4x2_END +.endm + +.macro KERNEL2x2x2 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D4, U8, U13, D4 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + vld U8, A0, 0x10 + vldrepl.d U12, B0, 0x10 + vldrepl.d U13, B0, 0x18 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D4, U0, U5, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x2x2_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D4, U8, U13, D4 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D4, U0, U5, D4 +.endm + +.macro KERNEL8x2x2 +.rept 4 + KERNEL2x2x2 +.endr +.endm + +.macro KERNEL8x2x2_END +.rept 3 + KERNEL2x2x2 +.endr + KERNEL2x2x2_END +.endm + +.macro KERNEL2x1x2 + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + + vld U4, B0, 0x00 + vldrepl.d U8, A0, 0x08 + vld U12, B0, 0x10 + vfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm + 
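+/* The vldrepl.d/vld pairing encodes which operand is broadcast: the
+ * wide-M kernels vector-load A and splat each B element, while narrow
+ * kernels (for example KERNEL2x1x2 just above, or KERNEL2x2x4) splat the
+ * A element(s) and vector-load B, so a single vfmadd.d still fills a
+ * whole vector of C accumulators spanning two columns. */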
+.macro KERNEL2x1x2_END + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + + vld U4, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x2 +.rept 4 + KERNEL2x1x2 +.endr +.endm + +.macro KERNEL8x1x2_END +.rept 3 + KERNEL2x1x2 +.endr + KERNEL2x1x2_END +.endm + +.macro KERNEL2x8x1 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vldrepl.d U4, B0, 0x00 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vld U8, A0, 0x40 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vld U9, A0, 0x50 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vld U10, A0, 0x60 + vld U11, A0, 0x70 + + vldrepl.d U12, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x8x1_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 +.endm + +.macro KERNEL8x8x1 +.rept 4 + KERNEL2x8x1 +.endr +.endm + +.macro KERNEL8x8x1_END +.rept 3 + KERNEL2x8x1 +.endr + KERNEL2x8x1_END +.endm + +.macro KERNEL2x4x1 + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vldrepl.d U4, B0, 0x00 + + vld U8, A0, 0x20 + vld U9, A0, 0x30 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vldrepl.d U12, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x4x1_END + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 +.endm + +.macro KERNEL8x4x1 +.rept 4 + KERNEL2x4x1 +.endr +.endm + +.macro KERNEL8x4x1_END +.rept 3 + KERNEL2x4x1 +.endr + KERNEL2x4x1_END +.endm + +.macro KERNEL2x2x1 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + vld U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x2x1_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x2x1 +.rept 4 + KERNEL2x2x1 +.endr +.endm + +.macro KERNEL8x2x1_END +.rept 3 + KERNEL2x2x1 +.endr + KERNEL2x2x1_END +.endm + +.macro KERNEL2x1x1 + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + vldrepl.d U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x1x1_END + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x1 +.rept 4 + KERNEL2x1x1 +.endr +.endm + +.macro KERNEL8x1x1_END +.rept 3 + KERNEL2x1x1 +.endr + KERNEL2x1x1_END +.endm + + + PROLOGUE + + addi.d $sp, $sp, -112 + /* Store regs */ + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f24, $sp, 40 + ST $f25, $sp, 48 + ST $f26, $sp, 56 + ST $f27, $sp, 64 + ST $f28, $sp, 72 + ST $f29, $sp, 80 + ST $f30, $sp, 88 + 
ST $f31, $sp, 96 + ST ALPHA, $sp, 104 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif + + /* if (!(N >> 2)) goto L_N3 */ + srai.d J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + vldrepl.d VALPHA, $sp, 104 /* When N < 4, VALPHA will not be changed */ + beq ZERO, J, .L_N3 + +.L_J1: /* J-- && this loop includes Condition 1 */ + +/************************* Condition 1 if((N >> 2) && (M >> 3)) START !!! ************************* +* dgemm_core_8x4 */ + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + addi.d J, J, -1 /* J-- */ + add.d C2, C1, T0 + add.d C3, C2, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_M8 + +.L_I1: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Calculate the first set of D0~D15, + * avoiding a separate zero-initialization pass + * Load 8 * 64 from A0 + * U0 = {a1, a0} + * U1 = {a3, a2} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + preld 0, C0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + preld 0, C0, 0x20 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + vldrepl.d U5, B0, 0x08 + preld 0, C1, 0x00 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + preld 0, C1, 0x20 + vfmul.d D6, U2, U5 + vfmul.d D7, U3, U5 + + vldrepl.d U6, B0, 0x10 + preld 0, C2, 0x00 + /* line 3 */ + vfmul.d D8, U0, U6 + vfmul.d D9, U1, U6 + preld 0, C2, 0x20 + vfmul.d D10, U2, U6 + vfmul.d D11, U3, U6 + + vldrepl.d U7, B0, 0x18 + preld 0, C3, 0x00 + /* line 4 */ + vfmul.d D12, U0, U7 + vfmul.d D13, U1, U7 + preld 0, C3, 0x20 + vfmul.d D14, U2, U7 + vfmul.d D15, U3, U7 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + vldrepl.d U14, B0, 0x10 + vldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_TL1_END +.L_TL1: /* TL-- */ + KERNEL8x8x4 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_TL1 + +.L_TL1_END: + KERNEL8x8x4_END + + /* Handle the remaining (L & 7) iterations, + * still accumulating into D0~D15.
+ */ +.L_L7: + /* if (!(L & 7)) goto L_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_L0 + +.L_L71: + /* Load 8 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + /* Accumulate into D0~D15 */ + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + vfmadd.d D14, U2, U7, D14 + vfmadd.d D15, U3, U7, D15 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_L71 + +.L_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D6, D6, VALPHA + vfmul.d D7, D7, VALPHA + vfmul.d D8, D8, VALPHA + vfmul.d D9, D9, VALPHA + vfmul.d D10, D10, VALPHA + vfmul.d D11, D11, VALPHA + vfmul.d D12, D12, VALPHA + vfmul.d D13, D13, VALPHA + vfmul.d D14, D14, VALPHA + vfmul.d D15, D15, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + vld U4, C1, 0x00 + vld U5, C1, 0x10 + vld U6, C1, 0x20 + vld U7, C1, 0x30 + vfmadd.d D4, D4, VALPHA, U4 + vfmadd.d D5, D5, VALPHA, U5 + vfmadd.d D6, D6, VALPHA, U6 + vfmadd.d D7, D7, VALPHA, U7 + + /* Load C2 */ + vld U8, C2, 0x00 + vld U9, C2, 0x10 + vld U10, C2, 0x20 + vld U11, C2, 0x30 + vfmadd.d D8, D8, VALPHA, U8 + vfmadd.d D9, D9, VALPHA, U9 + vfmadd.d D10, D10, VALPHA, U10 + vfmadd.d D11, D11, VALPHA, U11 + + /* Load C3 */ + vld U0, C3, 0x00 + vld U1, C3, 0x10 + vld U2, C3, 0x20 + vld U3, C3, 0x30 + vfmadd.d D12, D12, VALPHA, U0 + vfmadd.d D13, D13, VALPHA, U1 + vfmadd.d D14, D14, VALPHA, U2 + vfmadd.d D15, D15, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + vst D6, C1, 0x20 + vst D7, C1, 0x30 + /* Store C2 */ + vst D8, C2, 0x00 + vst D9, C2, 0x10 + vst D10, C2, 0x20 + vst D11, C2, 0x30 + /* Store C3 */ + vst D12, C3, 0x00 + vst D13, C3, 0x10 + vst D14, C3, 0x20 + vst D15, C3, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -8 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_I1 + +.L_M8: + /* The M >> 3 blocks are done; handle the remaining M & 4, M & 2 and M & 1 cases */ + andi I, M, 7 + beq ZERO,I, .L_M0 + + andi I, M, 4 + beq ZERO,I, .L_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d
A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + + vldrepl.d U6, B0, 0x10 + /* line 3 */ + vfmul.d D8, U0, U6 + vfmul.d D9, U1, U6 + + vldrepl.d U7, B0, 0x18 + /* line 4 */ + vfmul.d D12, U0, U7 + vfmul.d D13, U1, U7 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + vldrepl.d U14, B0, 0x10 + vldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M4_TL1_END + +.L_M4_TL1: /* TL-- */ + KERNEL8x4x4 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_TL1 + +.L_M4_TL1_END: + KERNEL8x4x4_END + +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 + +.L_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_L71 + +.L_M4_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D8, D8, VALPHA + vfmul.d D9, D9, VALPHA + vfmul.d D12, D12, VALPHA + vfmul.d D13, D13, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + vld U2, C1, 0x00 + vld U3, C1, 0x10 + vfmadd.d D4, D4, VALPHA, U2 + vfmadd.d D5, D5, VALPHA, U3 + + /* Load C2 */ + vld U4, C2, 0x00 + vld U5, C2, 0x10 + vfmadd.d D8, D8, VALPHA, U4 + vfmadd.d D9, D9, VALPHA, U5 + + /* Load C3 */ + vld U6, C3, 0x00 + vld U7, C3, 0x10 + vfmadd.d D12, D12, VALPHA, U6 + vfmadd.d D13, D13, VALPHA, U7 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + /* Store C2 */ + vst D8, C2, 0x00 + vst D9, C2, 0x10 + /* Store C3 */ + vst D12, C3, 0x00 + vst D13, C3, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ 
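For orientation, a minimal C sketch of what the 8x4 micro-kernel above computes in its plain (non-TRMM) path follows; the function name, signature and packed-panel layout are illustrative assumptions made for this note, not part of the kernel:

/* Reference semantics only (assumed packing: A as K slices of 8 doubles,
   B as K slices of 4 doubles, C column-major with leading dimension ldc). */
static void dgemm_micro_8x4_ref(long K, double alpha, const double *A,
                                const double *B, double *C, long ldc)
{
    double acc[4][8] = {{0.0}};         /* plays the role of D0..D15      */
    for (long k = 0; k < K; k++)        /* the .L_TL1/.L_L71 loops        */
        for (int j = 0; j < 4; j++)     /* the four vldrepl.d of B        */
            for (int i = 0; i < 8; i++) /* the vld'ed 8-double slice of A */
                acc[j][i] += A[8 * k + i] * B[4 * k + j];
    for (int j = 0; j < 4; j++)         /* the .L_L0 write-back           */
        for (int i = 0; i < 8; i++)
            C[(long)j * ldc + i] += alpha * acc[j][i];
}

The assembly keeps acc entirely in vector registers (D0-D15, two doubles each) and fuses the alpha scaling with the reload of C through vfmadd.d, so each block of C is read and written exactly once.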
+ +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vfmul.d D0, U0, U4 + vfmul.d D1, U0, U5 + vfmul.d D2, U1, U4 + vfmul.d D3, U1, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 + + vldrepl.d U8, A0, 0x00 + vldrepl.d U9, A0, 0x08 + + addi.d TL, TL, -1 + + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M2_TL1_END +.L_M2_TL1: /* TL-- */ + KERNEL8x2x4 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 +.L_M2_TL1_END: + KERNEL8x2x4_END + +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 + +.L_M2_L71: + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M2_L71 + +.L_M2_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + vstelm.d D1, C2, 0x00, 0x00 + vstelm.d D1, C3, 0x00, 0x01 + vstelm.d D2, C0, 0x08, 0x00 + vstelm.d D2, C1, 0x08, 0x01 + vstelm.d D3, C2, 0x08, 0x00 + vstelm.d D3, C3, 0x08, 0x01 +#else + /* Load C0 */ + vld U0, C0, 0x00 + /* Load C1 */ + vld U1, C1, 0x00 + /* Load C2 */ + vld U2, C2, 0x00 + /* Load C3 */ + vld U3, C3, 0x00 + + vilvl.d D4, D2, D0 //C0 + vilvh.d D5, D2, D0 //C1 + vilvl.d D6, D3, D1 //C2 + vilvh.d D7, D3, D1 //C3 + + vfmadd.d D0, D4, VALPHA, U0 + vfmadd.d D2, D5, VALPHA, U1 + vfmadd.d D1, D6, VALPHA, U2 + vfmadd.d D3, D7, VALPHA, U3 + + vst D0, C0, 0x00 + vst D2, C1, 0x00 + vst D1, C2, 0x00 + vst D3, C3, 0x00 +#endif // #if defined(TRMMKERNEL) + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ + +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + vldrepl.d U0, A0, 0x00 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + vfmul.d D0, U0, U4 + vfmul.d D1, U0, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M1_TL1_END + +.L_M1_TL1: /* TL-- */ + KERNEL8x1x4 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 +.L_M1_TL1_END: + KERNEL8x1x4_END + +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 + +.L_M1_L71: + vldrepl.d U0, A0, 0x00 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M1_L71 + +.L_M1_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + vstelm.d D1, C2, 0x00, 0x00 + vstelm.d D1, C3, 0x00, 0x01 +#else + /* Load C0 */ + vldrepl.d U0, C0, 0x00 + vldrepl.d U1, C1, 0x00 + vilvl.d D4, U1, U0 + vfmadd.d D6, D0, VALPHA, D4 + + vldrepl.d U2, C2, 0x00 + vldrepl.d U3, C3, 0x00 + vilvl.d D5, U3, U2 + vfmadd.d D7, D1, VALPHA, D5 + + vstelm.d D6, C0, 0x00, 0x00 + vstelm.d D6, C1, 0x00, 0x01 + vstelm.d D7, C2, 0x00, 0x00 + vstelm.d D7, C3, 0x00, 0x01 +#endif // #if defined(TRMMKERNEL) + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + addi.d C2, C2, 0x08 + addi.d C3, C3, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -1 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 1) ) End************/ + +.L_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + /* since the array type is double, + * so we must mul 32 + */ + slli.d T0, K, 5 + slli.d T1, LDC, 5 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x04 +#endif + + blt ZERO, J, .L_J1 + +//////////////// go back to L_J1 ///////////////// +///////////////////////////////////////////////// +/************************ Condition 1 if((N >> 2) && (M >> 3)) END !!! ************************/ + + vldrepl.d VALPHA, $sp, 104 + +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +/************************* Condition 2 if((N & 2) && (M >> 3)) START !!! 
************************* +* dgemm_core_16x2 */ + + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_N3_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_N3_M8 + +.L_N3_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 + * U0 = {a1, a0} + * U1 = {a3, a2} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + vfmul.d D6, U2, U5 + vfmul.d D7, U3, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_N3_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_TL1_END + +.L_N3_TL1: /* TL-- */ + KERNEL8x8x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_TL1 +.L_N3_TL1_END: + KERNEL8x8x2_END + +.L_N3_L7: + /* if (!(L & 7)) goto L_N3_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_L0 + +.L_N3_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_L71 + +.L_N3_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D6, D6, VALPHA + vfmul.d D7, D7, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + vld U4, C1, 0x00 + vld U5, C1, 0x10 + vld U6, C1, 0x20 + vld U7, C1, 0x30 + vfmadd.d D4, D4, VALPHA, U4 + vfmadd.d D5, D5, VALPHA, U5 + vfmadd.d D6, D6, VALPHA, U6 + vfmadd.d D7, D7, VALPHA, U7 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + vst D6, C1, 0x20 + vst D7, C1, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef 
LEFT + addi.d L, L, -8 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x8 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N3_I1 + +.L_N3_M8: + /* We have done M & 8, considering M=4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_N3_M0 + + andi I, M, 4 + beq ZERO,I, .L_N3_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M4_L7 */ + beq ZERO,TL, .L_N3_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M4_TL1_END + +.L_N3_M4_TL1: /* TL-- */ + KERNEL8x4x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M4_TL1 +.L_N3_M4_TL1_END: + KERNEL8x4x2_END + +.L_N3_M4_L7: + /* if (!(L & 7)) goto L_N3_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M4_L0 + +.L_N3_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M4_L71 + +.L_N3_M4_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + vld U2, C1, 0x00 + vld U3, C1, 0x10 + vfmadd.d D4, D4, VALPHA, U2 + vfmadd.d D5, D5, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 4) ) End************/ + +.L_N3_M2: + andi I, M, 2 + beq ZERO,I, .L_N3_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d 
L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + vldrepl.d U4, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M2_L7 */ + beq ZERO,TL, .L_N3_M2_L7 + + vld U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M2_TL1_END + +.L_N3_M2_TL1: /* TL-- */ + KERNEL8x2x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M2_TL1 +.L_N3_M2_TL1_END: + KERNEL8x2x2_END + +.L_N3_M2_L7: + /* if (!(L & 7)) goto L_N3_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M2_L0 + +.L_N3_M2_L71: + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + vfmadd.d D0, U0, U4, D0 + + vfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M2_L71 + +.L_N3_M2_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + vld U1, C1, 0x00 + vfmadd.d D4, D4, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + vst D0, C0, 0x00 + vst D4, C1, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 2) ) End************/ + +.L_N3_M1: + andi I, M, 1 + beq ZERO,I, .L_N3_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + + vld U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_N3_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + + vld U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M1_TL1_END + +.L_N3_M1_TL1: /* TL-- */ + KERNEL8x1x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M1_TL1 +.L_N3_M1_TL1_END: + KERNEL8x1x2_END + +.L_N3_M1_L7: + /* if (!(L & 7)) goto L_N3_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M1_L0 + +.L_N3_M1_L71: + vldrepl.d U0, A0, 0x00 + + vld U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d 
B0, B0, 0x10
+
+    addi.d TL, TL, -1
+    blt ZERO,TL, .L_N3_M1_L71
+
+.L_N3_M1_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d D0, D0, VALPHA
+#else
+    /* Load C0 */
+    vld U0, C0, 0x00
+    vld U1, C1, 0x00
+    vilvl.d U2, U1, U0
+    vfmadd.d D0, D0, VALPHA, U2
+#endif // #if defined(TRMMKERNEL)
+
+    vstelm.d D0, C0, 0x00, 0x00
+    vstelm.d D0, C1, 0x00, 0x01
+
+    /* Add stride for C */
+    addi.d C0, C0, 0x08
+    addi.d C1, C1, 0x08
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d L, K, OFF
+#ifdef LEFT
+    addi.d L, L, -1
+#else
+    addi.d L, L, -2
+#endif
+    slli.d T0, L, 0x03
+    add.d A0, A0, T0
+    slli.d T0, L, 0x04
+    add.d B0, B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d OFF, OFF, 0x01
+#endif
+#endif // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N & 2 ) && (M & 1) ) End************/
+
+.L_N3_M0:
+    /* Add stride for B and C
+     * B += (K * 16)
+     * C += (LDC * 16)
+     */
+    /* each element is a double (8 bytes) and we advance two
+     * columns, so we must multiply by 2 * 8 = 16
+     */
+    slli.d T0, K, 4
+    slli.d T1, LDC, 4
+    add.d B, B, T0
+    add.d C, C, T1
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d OFF, OFF, 0x02
+#endif
+
+    /* We must reinit I */
+    srai.d I, M, 4 /* I = bm >> 4 */
+
+/************************* Condition 2 if((N & 2) && (M >> 3)) End !!! *************************
+* dgemm_core_16x2 */
+
+.L_N1:
+    andi J, N, 1
+    beq ZERO, J, .L_N0
+
+/************************* Condition 3 if((N & 1) && (M >> 3)) START !!! *************************
+* dgemm_core_16x1 */
+
+    move C0, C
+    move A0, A
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move OFF, OFFSET
+#endif
+
+    /* if (!(M >> 3)) goto L_N1_M8 */
+    srai.d I, M, 3 /* I = bm >> 3 */
+    beq ZERO, I, .L_N1_M8
+
+.L_N1_I1:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move B0, B
+#else
+    slli.d T0, OFF, 0x06
+    add.d A0, A0, T0
+    slli.d T0, OFF, 0x03
+    add.d B0, B, T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d L, K, OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d L, OFF, 8
+#else
+    /* number of values in B */
+    addi.d L, OFF, 1
+#endif
+#else // #if !defined(TRMMKERNEL)
+    move B0, B
+    move L, K /* L = bk */
+#endif
+
+    /* Load 8 * 64 from A0
+     * U0 = {a1, a0}
+     * U1 = {a3, a2}
+     * U2 = {a5, a4}
+     * U3 = {a7, a6}
+     */
+
+    vld U0, A0, 0x00
+    vld U1, A0, 0x10
+    vld U2, A0, 0x20
+    vld U3, A0, 0x30
+
+    vldrepl.d U4, B0, 0x00
+    /* line 1 */
+    vfmul.d D0, U0, U4
+    vfmul.d D1, U1, U4
+    vfmul.d D2, U2, U4
+    vfmul.d D3, U3, U4
+
+    /* Add stride for A0 and B0 */
+    addi.d A0, A0, 0x40
+    addi.d B0, B0, 0x08
+    /* Reduce L */
+    addi.d L, L, -1
+    srai.d TL, L, 3 /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_L7 */
+    beq ZERO,TL, .L_N1_L7
+
+    vld U8, A0, 0x00
+    vld U9, A0, 0x10
+    vld U10, A0, 0x20
+    vld U11, A0, 0x30
+
+    addi.d TL, TL, -1
+
+    vldrepl.d U12, B0, 0x00
+    addi.d A0, A0, 0x40
+    addi.d B0, B0, 0x08
+
+    beq ZERO, TL, .L_N1_TL1_END
+.L_N1_TL1: /* TL-- */
+    KERNEL8x8x1
+
+    addi.d TL, TL, -1 /* TL-- */
+    blt ZERO,TL, .L_N1_TL1
+.L_N1_TL1_END:
+    KERNEL8x8x1_END
+
+.L_N1_L7:
+    /* if (!(L & 7)) goto L_N1_L0 */
+    andi TL, L, 7
+    beq TL, ZERO,.L_N1_L0
+
+.L_N1_L71:
+    /* Load 8 * 64 from A0 */
+    vld U0, A0, 0x00
+    vld U1, A0, 0x10
+    vld U2, A0, 0x20
+    vld U3, A0, 0x30
+
+    vldrepl.d U4, B0, 0x00
+    vfmadd.d D0, U0, U4, D0
+    vfmadd.d D1, U1, U4, D1
+    vfmadd.d D2, U2, U4, D2
+    vfmadd.d D3, U3, U4, D3
+
+    /* Add stride for A0, B0 */
+    addi.d A0, A0, 0x40
+    addi.d B0, B0, 0x08
+
+    addi.d TL, TL, -1
+    blt ZERO,TL, .L_N1_L71
+
+.L_N1_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d D0, D0, VALPHA
+    vfmul.d D1, D1, VALPHA
+    vfmul.d D2, D2, VALPHA
+    vfmul.d D3, D3, VALPHA
+#else
+    /* Load C0 */
+    vld U0, C0, 0x00
+    vld U1, C0, 0x10
+    vld U2, C0, 0x20
+    vld U3, C0, 0x30
+    vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */
+    vfmadd.d D1, D1, VALPHA, U1
+    vfmadd.d D2, D2, VALPHA, U2
+    vfmadd.d D3, D3, VALPHA, U3
+#endif // #if defined(TRMMKERNEL)
+
+    /* Store C0 */
+    vst D0, C0, 0x00
+    vst D1, C0, 0x10
+    vst D2, C0, 0x20
+    vst D3, C0, 0x30
+
+    /* Add stride for C */
+    addi.d C0, C0, 0x40
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d L, K, OFF
+#ifdef LEFT
+    addi.d L, L, -8
+#else
+    addi.d L, L, -1
+#endif
+    slli.d T0, L, 0x06
+    add.d A0, A0, T0
+    slli.d T0, L, 0x03
+    add.d B0, B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d OFF, OFF, 0x8
+#endif
+#endif // #if defined(TRMMKERNEL)
+
+    addi.d I, I, -1 /* I-- */
+    blt ZERO,I, .L_N1_I1
+
+.L_N1_M8:
+    /* We have done the M >> 3 blocks, considering M=4/2/1 */
+    andi I, M, 7
+    beq ZERO,I, .L_N1_M0
+
+    andi I, M, 4
+    beq ZERO,I, .L_N1_M2
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move B0, B
+#else
+    slli.d T0, OFF, 0x05
+    add.d A0, A0, T0
+    slli.d T0, OFF, 0x03
+    add.d B0, B, T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d L, K, OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d L, OFF, 4
+#else
+    /* number of values in B */
+    addi.d L, OFF, 1
+#endif
+#else // #if !defined(TRMMKERNEL)
+    move B0, B
+    move L, K /* L = bk */
+#endif
+
+    /* Load 4 * 64 from A0 */
+    vld U0, A0, 0x00
+    vld U1, A0, 0x10
+
+    vldrepl.d U4, B0, 0x00
+    /* line 1 */
+    vfmul.d D0, U0, U4
+    vfmul.d D1, U1, U4
+
+    /* Add stride for A0 and B0 */
+    addi.d A0, A0, 0x20
+    addi.d B0, B0, 0x08
+    /* Reduce L */
+    addi.d L, L, -1
+    srai.d TL, L, 3 /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M4_L7 */
+    beq ZERO,TL, .L_N1_M4_L7
+
+    vld U8, A0, 0x00
+    vld U9, A0, 0x10
+
+    addi.d TL, TL, -1
+
+    vldrepl.d U12, B0, 0x00
+    addi.d A0, A0, 0x20
+    addi.d B0, B0, 0x08
+
+    beq ZERO, TL, .L_N1_M4_TL1_END
+
+.L_N1_M4_TL1: /* TL-- */
+    KERNEL8x4x1
+
+    addi.d TL, TL, -1 /* TL-- */
+    blt ZERO,TL, .L_N1_M4_TL1
+.L_N1_M4_TL1_END:
+    KERNEL8x4x1_END
+
+.L_N1_M4_L7:
+    /* if (!(L & 7)) goto L_N1_M4_L0 */
+    andi TL, L, 7
+    beq TL, ZERO,.L_N1_M4_L0
+
+.L_N1_M4_L71:
+    vld U0, A0, 0x00
+    vld U1, A0, 0x10
+
+    vldrepl.d U4, B0, 0x00
+    vfmadd.d D0, U0, U4, D0
+    vfmadd.d D1, U1, U4, D1
+
+    /* Add stride for A0, B0 */
+    addi.d A0, A0, 0x20
+    addi.d B0, B0, 0x08
+
+    addi.d TL, TL, -1
+    blt ZERO,TL, .L_N1_M4_L71
+
+.L_N1_M4_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d D0, D0, VALPHA
+    vfmul.d D1, D1, VALPHA
+#else
+    /* Load C0 */
+    vld U0, C0, 0x00
+    vld U1, C0, 0x10
+    vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */
+    vfmadd.d D1, D1, VALPHA, U1
+#endif // #if defined(TRMMKERNEL)
+
+    /* Store C0 */
+    vst D0, C0, 0x00
+    vst D1, C0, 0x10
+
+    /* Add stride for C */
+    addi.d C0, C0, 0x20
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d L, K, OFF
+#ifdef LEFT
+    addi.d L, L, -4
+#else
+    addi.d L, L, -1
+#endif
+    slli.d T0, L, 0x05
+    add.d A0, A0, T0
+    slli.d T0, L, 0x03
+    add.d B0, B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d OFF, OFF, 0x04
+#endif
+#endif // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N & 1) && (M & 4) ) End************/
+
+.L_N1_M2:
+    andi I, M, 2
+    beq ZERO,I, .L_N1_M1
+
+#if defined(TRMMKERNEL)
+#if
(defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 + + vld U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M2_TL1_END + +.L_N1_M2_TL1: /* TL-- */ + KERNEL8x2x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 +.L_N1_M2_TL1_END: + KERNEL8x2x1_END + +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 + +.L_N1_M2_L71: + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 + +.L_N1_M2_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C0, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 2) ) End************/ + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M1_TL1_END + +.L_N1_M1_TL1: /* TL-- */ + KERNEL8x1x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_TL1_END: + KERNEL8x1x1_END + +.L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 + +.L_N1_M1_L71: + vldrepl.d U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* 
Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 + +.L_N1_M1_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vldrepl.d U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 1) ) End************/ + +.L_N1_M0: + +/************************* Condition 3 if((N & 1) && (M >> 3)) End !!! ************************* +* dgemm_core_16x1 */ + +.L_N0: + /* Restore regs */ + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LD $f24, $sp, 40 + LD $f25, $sp, 48 + LD $f26, $sp, 56 + LD $f27, $sp, 64 + LD $f28, $sp, 72 + LD $f29, $sp, 80 + LD $f30, $sp, 88 + LD $f31, $sp, 96 + addi.d $sp, $sp, 112 + + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_4_lsx.S b/kernel/loongarch64/dgemm_ncopy_4_lsx.S new file mode 100644 index 000000000..048a49af6 --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_4_lsx.S @@ -0,0 +1,185 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r21 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + PROLOGUE + + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x02 + beq J, ZERO, .L_N2 +.L_J1: /* J-- */ + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + addi.d J, J, -1 + beq I, ZERO, .L_I3 +.L_I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + addi.d TD, TD, 0x40 + + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_I0 +.L_II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_II1 +.L_I0: + blt ZERO, J, .L_J1 +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_2I3 +.L_2I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00 + GINTERLACE v, d, D0, D1, U1, U0 + GST v, , D0, TD, 0x00, D1, TD, 0x10 + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 +.L_2I3: + andi I, M, 0x01 + beq ZERO, I, .L_N1 +.L_2II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fst.d F0, TD, 0x00 + addi.d I, I, -1 + fst.d F1, TD, 0x08 + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + blt ZERO, I, .L_2II1 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 +.L_N0: + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_8_lsx.S b/kernel/loongarch64/dgemm_ncopy_8_lsx.S new file mode 100644 index 000000000..30bebe8df --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_8_lsx.S @@ -0,0 +1,283 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
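dgemm_ncopy_4_lsx.S above is the 4-wide column packer: for each group of four source columns, the GLD/GINTERLACE/GST helpers (macros from loongarch64_asm.S, built on vld/vilvl.d/vilvh.d/vst) emit the four column entries of every row contiguously, and the N & 2 / N & 1 tails follow the same row-by-row rule. A plain-C model of the output ordering, assuming column-major input with leading dimension lda (a sketch for orientation, not the shipped code):

```c
/* Output ordering of the 4-wide ncopy above: b receives, for each
 * 4-column group and each row i, the four entries a[i][j..j+3]. */
static void dgemm_ncopy_4_ref(long m, long n, const double *a,
                              long lda, double *b)
{
    long j = 0;
    for (; j + 4 <= n; j += 4)             /* N >> 2 groups */
        for (long i = 0; i < m; i++)
            for (long jj = 0; jj < 4; jj++)
                *b++ = a[i + (j + jj) * lda];
    if (n & 2) {                           /* two-column tail */
        for (long i = 0; i < m; i++) {
            *b++ = a[i + j * lda];
            *b++ = a[i + (j + 1) * lda];
        }
        j += 2;
    }
    if (n & 1)                             /* last column */
        for (long i = 0; i < m; i++)
            *b++ = a[i + j * lda];
}
```

The 8-wide variant whose header starts here does the same with eight column pointers (S1..S8), and the dgemm_tcopy_* files further below are the row-block counterparts: they copy contiguous row segments directly and steer the column tails to separate sub-panel pointers (P2/P3 in the 4-wide file, P2/P3/P4 in the 8-wide one).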
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r21 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + PROLOGUE + push_if_used 26, 32 + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x03 + beq J, ZERO, .L_N4 +.L_J1: + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x03 + add.d S3, S2, TL + addi.d J, J, -1 + add.d S4, S3, TL + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d TS, S7, T0 + beq I, ZERO, .L_I7 +.L_I1: + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00, \ + U4, S5, 0x00, U5, S6, 0x00, U6, S7, 0x00, U7, S8, 0x00 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10, \ + U4, S5, 0x10, U5, S6, 0x10, U6, S7, 0x10, U7, S8, 0x10 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, 
D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x20, U1, S2, 0x20, U2, S3, 0x20, U3, S4, 0x20, \ + U4, S5, 0x20, U5, S6, 0x20, U6, S7, 0x20, U7, S8, 0x20 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x30, U1, S2, 0x30, U2, S3, 0x30, U3, S4, 0x30, \ + U4, S5, 0x30, U5, S6, 0x30, U6, S7, 0x30, U7, S8, 0x30 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_I7: + andi I, M, 0x07 + beq I, ZERO, .L_I0 +.L_II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + fst.d F4, TD, 0x20 + addi.d S5, S5, 0x08 + fst.d F5, TD, 0x28 + addi.d S6, S6, 0x08 + fst.d F6, TD, 0x30 + addi.d S7, S7, 0x08 + fst.d F7, TD, 0x38 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_II1 +.L_I0: + blt ZERO, J, .L_J1 +.L_N4: + andi J, N, 0x04 + beq ZERO, J, .L_N2 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + beq I, ZERO, .L_I3 +.L_4I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + addi.d TD, TD, 0x40 + + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_4I1 +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_N2 +.L_4II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_4II1 +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_NI1 +.L_2I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00 + GINTERLACE v, d, D0, D1, U1, U0 + GST v, , D0, TD, 0x00, D1, TD, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 +.L_NI1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 
+ addi.d TD, TD, 0x10 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_4_lsx.S b/kernel/loongarch64/dgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..134066471 --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_4_lsx.S @@ -0,0 +1,280 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+/* Function parameters */
+#define M $r4 // param 1: m
+#define N $r5 // param 2: n
+#define SRC $r6 // param 3: src
+#define LDA $r7 // param 4: lda
+#define DST $r8 // param 5: dst
+
+#define I $r9
+#define J $r10
+#define S0 $r11
+#define S1 $r12
+#define S2 $r13
+#define S3 $r14
+#define S4 $r15
+#define P0 $r16
+#define P1 $r17
+#define P2 $r18
+#define P3 $r19
+#define T0 $r20
+#define T1 $r23
+#define TL $r7
+#define ZERO $r0
+
+#define F0 $f0
+#define F1 $f1
+#define F2 $f2
+#define F3 $f3
+/* LSX vectors */
+#define U0 $vr0
+#define U1 $vr1
+#define U2 $vr2
+#define U3 $vr3
+#define U4 $vr4
+#define U5 $vr5
+#define U6 $vr6
+#define U7 $vr7
+
+    PROLOGUE
+    push_if_used 18, 8
+
+    move S0, SRC
+    move P0, DST
+
+    // Compute the sub-panel pointers P2 and P3
+    srai.d T0, N, 0x02
+    slli.d T0, T0, 0x02
+    srai.d T1, N, 0x01
+    slli.d T1, T1, 0x01
+    mul.d T0, M, T0
+    mul.d T1, M, T1
+    slli.d T0, T0, 0x03
+    slli.d T1, T1, 0x03
+    add.d P2, DST, T0
+    add.d P3, DST, T1
+
+    slli.d TL, LDA, 0x03
+    srai.d J, M, 0x02
+    slli.d T0, TL, 0x01
+    slli.d T1, M, 0x05
+    beq ZERO, J, .L_M3
+.L_J1: /* J-- */
+    move S1, S0
+    add.d S2, S0, TL
+    add.d S3, S1, T0
+    add.d S4, S2, T0
+    add.d S0, S3, T0
+
+    move P1, P0
+    addi.d P0, P0, 0x80
+
+    srai.d I, N, 0x02
+    addi.d J, J, -1
+    beq ZERO, I, .L_N3
+.L_I1: /* I-- */
+    vld U0, S1, 0x00
+    vld U1, S1, 0x10
+    vld U2, S2, 0x00
+    vld U3, S2, 0x10
+    vld U4, S3, 0x00
+    vld U5, S3, 0x10
+    vld U6, S4, 0x00
+    vld U7, S4, 0x10
+
+    vst U0, P1, 0x00
+    vst U1, P1, 0x10
+    vst U2, P1, 0x20
+    vst U3, P1, 0x30
+    vst U4, P1, 0x40
+    vst U5, P1, 0x50
+    vst U6, P1, 0x60
+    vst U7, P1, 0x70
+
+    addi.d S1, S1, 0x20
+    addi.d S2, S2, 0x20
+    addi.d S3, S3, 0x20
+    addi.d S4, S4, 0x20
+    add.d P1, P1, T1
+
+    addi.d I, I, -1
+    blt ZERO, I, .L_I1
+.L_N3:
+    andi I, N, 0x02
+    beq ZERO, I, .L_N1
+
+    vld U0, S1, 0x00
+    vld U1, S2, 0x00
+    vld U2, S3, 0x00
+    vld U3, S4, 0x00
+
+    vst U0, P2, 0x00
+    vst U1, P2, 0x10
+    vst U2, P2, 0x20
+    vst U3, P2, 0x30
+
+    addi.d S1, S1, 0x10
+    addi.d S2, S2, 0x10
+    addi.d S3, S3, 0x10
+    addi.d S4, S4, 0x10
+    addi.d P2, P2, 0x40
+.L_N1:
+    andi I, N, 0x01
+    beq ZERO, I, .L_N0
+
+    fld.d F0, S1, 0x00
+    fld.d F1, S2, 0x00
+    fld.d F2, S3, 0x00
+    fld.d F3, S4, 0x00
+
+    fst.d F0, P3, 0x00
+    fst.d F1, P3, 0x08
+    fst.d F2, P3, 0x10
+    fst.d F3, P3, 0x18
+
+    addi.d S1, S1, 0x08
+    addi.d S2, S2, 0x08
+    addi.d S3, S3, 0x08
+    addi.d S4, S4, 0x08
+    addi.d P3, P3, 0x20
+
+.L_N0:
+    blt ZERO, J, .L_J1
+
+.L_M3:
+    andi J, M, 0x02
+    beq ZERO, J, .L_M1
+
+    move S1, S0
+    add.d S2, S0, TL
+    add.d S0, S0, T0
+
+    move P1, P0
+    addi.d P0, P0, 0x40
+
+    srai.d I, N, 0x02
+    beq ZERO, I, .L_2N3
+
+.L_2I1: /* I-- */
+    vld U0, S1, 0x00
+    vld U1, S1, 0x10
+    vld U2, S2, 0x00
+    vld U3, S2, 0x10
+
+    vst U0, P1, 0x00
+    vst U1, P1, 0x10
+    vst U2, P1, 0x20
+    vst U3, P1, 0x30
+
+    addi.d S1, S1, 0x20
+    addi.d S2, S2, 0x20
+    addi.d I, I, -1
+    add.d P1, P1, T1
+
+    blt ZERO, I, .L_2I1
+
+.L_2N3:
+    andi I, N, 0x02
+    beq ZERO, I, .L_2N1
+
+    vld U0, S1, 0x00
+    vld U1, S2, 0x00
+
+    vst U0, P2, 0x00
+    vst U1, P2, 0x10
+
+    addi.d S1, S1, 0x10
+    addi.d S2, S2, 0x10
+    addi.d P2, P2, 0x20
+
+.L_2N1:
+    andi I, N, 0x01
+    beq ZERO, I, .L_M1
+
+    fld.d F0, S1, 0x00
+    fld.d F1, S2, 0x00
+
+    fst.d F0, P3, 0x00
+    fst.d F1, P3, 0x08
+
+    addi.d S1, S1, 0x08
+    addi.d S2, S2, 0x08
+    addi.d P3, P3, 0x10
+.L_M1:
+    andi J, M, 0x01
+    beq ZERO, J, .L_M0
+
+    move S1, S0
+    move P1, P0
+
+    srai.d I, N, 0x02
+    beq
ZERO, I, .L_1N3 + +.L_1I1: + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + + addi.d S1, S1, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_1I1 + +.L_1N3: + andi I, N, 0x02 + beq I, ZERO, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P2, 0x00 + fst.d F1, P2, 0x08 + + addi.d S1, S1, 0x10 + addi.d P2, P2, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq I, ZERO, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P3, 0x00 + +.L_M0: + pop_if_used 18, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_8_lsx.S b/kernel/loongarch64/dgemm_tcopy_8_lsx.S new file mode 100644 index 000000000..a7e3ef69c --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_8_lsx.S @@ -0,0 +1,597 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 + + PROLOGUE + push_if_used 24, 8 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x03 + srai.d T1, N, 0x02 + slli.d T0, T0, 0x03 + slli.d T1, T1, 0x02 + mul.d P2, M, T0 + mul.d P3, M, T1 + slli.d P2, P2, 0x03 + slli.d P3, P3, 0x03 + add.d P2, DST, P2 + add.d P3, DST, P3 + + srai.d T0, N, 0x01 + slli.d T0, T0, 0x01 + mul.d P4, M, T0 + slli.d P4, P4, 0x03 + add.d P4, DST, P4 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x03 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x06 + beq ZERO, J, .L_M7 +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S0, S7, T0 + + move P1, P0 + addi.d P0, P0, 0x200 + + srai.d I, N, 0x03 + addi.d J, J, -1 + beq ZERO, I, .L_N7 + +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + vld U0, S3, 0x00 + vld U1, S3, 0x10 + vld U2, S3, 0x20 + vld U3, S3, 0x30 + vld U4, S4, 0x00 + vld U5, S4, 0x10 + vld U6, S4, 0x20 + vld U7, S4, 0x30 + + vst U0, P1, 0x80 + vst U1, P1, 0x90 + vst U2, P1, 0xa0 + vst U3, P1, 0xb0 + vst U4, P1, 0xc0 + vst U5, P1, 0xd0 + vst U6, P1, 0xe0 + vst U7, P1, 0xf0 + + vld U0, S5, 0x00 + vld U1, S5, 0x10 + vld U2, S5, 0x20 + vld U3, S5, 0x30 + vld U4, S6, 0x00 + vld U5, S6, 0x10 + vld U6, S6, 0x20 + vld U7, S6, 0x30 + + vst U0, P1, 0x100 + vst U1, P1, 0x110 + vst U2, P1, 0x120 + vst U3, P1, 0x130 + vst U4, P1, 0x140 + vst U5, P1, 0x150 + vst U6, P1, 0x160 + vst U7, P1, 0x170 + + vld U0, S7, 0x00 + vld U1, S7, 0x10 + vld U2, S7, 0x20 + vld U3, S7, 0x30 + vld U4, S8, 0x00 + vld U5, S8, 0x10 + vld U6, S8, 0x20 + vld U7, S8, 0x30 + + vst U0, P1, 0x180 + vst U1, P1, 0x190 + vst U2, P1, 0x1a0 + vst U3, P1, 0x1b0 + vst U4, P1, 0x1c0 + vst U5, P1, 0x1d0 + vst U6, P1, 0x1e0 + vst U7, P1, 0x1f0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_I1 +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + vst U4, P2, 0x40 + 
vst U5, P2, 0x50 + vst U6, P2, 0x60 + vst U7, P2, 0x70 + + vld U0, S5, 0x00 + vld U1, S5, 0x10 + vld U2, S6, 0x00 + vld U3, S6, 0x10 + vld U4, S7, 0x00 + vld U5, S7, 0x10 + vld U6, S8, 0x00 + vld U7, S8, 0x10 + + vst U0, P2, 0x80 + vst U1, P2, 0x90 + vst U2, P2, 0xa0 + vst U3, P2, 0xb0 + vst U4, P2, 0xc0 + vst U5, P2, 0xd0 + vst U6, P2, 0xe0 + vst U7, P2, 0xf0 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S5, S5, 0x20 + addi.d S6, S6, 0x20 + addi.d S7, S7, 0x20 + addi.d S8, S8, 0x20 + addi.d P2, P2, 0x100 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + vld U4, S5, 0x00 + vld U5, S6, 0x00 + vld U6, S7, 0x00 + vld U7, S8, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + vst U2, P3, 0x20 + vst U3, P3, 0x30 + vst U4, P3, 0x40 + vst U5, P3, 0x50 + vst U6, P3, 0x60 + vst U7, P3, 0x70 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d P3, P3, 0x80 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + fst.d F4, P4, 0x20 + fst.d F5, P4, 0x28 + + fst.d F6, P4, 0x30 + fst.d F7, P4, 0x38 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d P4, P4, 0x40 + +.L_N0: + blt ZERO, J, .L_J1 +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 + + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x100 + + srai.d I, N, 0x03 + beq ZERO, I, .L_4N7 +.L_4I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + vld U0, S3, 0x00 + vld U1, S3, 0x10 + vld U2, S3, 0x20 + vld U3, S3, 0x30 + vld U4, S4, 0x00 + vld U5, S4, 0x10 + vld U6, S4, 0x20 + vld U7, S4, 0x30 + + vst U0, P1, 0x80 + vst U1, P1, 0x90 + vst U2, P1, 0xa0 + vst U3, P1, 0xb0 + vst U4, P1, 0xc0 + vst U5, P1, 0xd0 + vst U6, P1, 0xe0 + vst U7, P1, 0xf0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_4I1 +.L_4N7: + andi I, N, 0x04 + beq ZERO, I, .L_4N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + vst U4, P2, 0x40 + vst U5, P2, 0x50 + vst U6, P2, 0x60 + vst U7, P2, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d P2, P2, 0x80 + +.L_4N3: + andi I, N, 0x02 + beq ZERO, I, .L_4N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + vst U2, P3, 0x20 + vst U3, P3, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P3, P3, 0x40 + +.L_4N1: + andi 
I, N, 0x01 + beq ZERO, I, .L_M3 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P4, P4, 0x20 +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x03 + beq ZERO, I, .L_2N7 +.L_2I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_2I1 +.L_2N7: + andi I, N, 0x04 + beq ZERO, I, .L_2N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d P2, P2, 0x40 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P3, P3, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P4, P4, 0x10 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + add.d S2, S0, TL + + move P1, P0 + addi.d P0, P0, 0x40 + + srai.d I, N, 0x03 + beq ZERO, I, .L_1N7 +.L_1I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + + addi.d S1, S1, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_1I1 + +.L_1N7: + andi I, N, 0x04 + beq ZERO, I, .L_1N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + + addi.d S1, S1, 0x20 + addi.d P2, P2, 0x20 + +.L_1N3: + andi I, N, 0x02 + beq ZERO, I, .L_1N1 + + vld U0, S1, 0x00 + vst U0, P3, 0x00 + + addi.d S1, S1, 0x10 + addi.d P3, P3, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P4, 0x00 + + addi.d S1, S1, 0x08 + addi.d P4, P4, 0x08 +.L_M0: + pop_if_used 24, 8 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dnrm2_lasx.S b/kernel/loongarch64/dnrm2_lasx.S new file mode 100644 index 000000000..5a6f7cf1e --- /dev/null +++ b/kernel/loongarch64/dnrm2_lasx.S @@ -0,0 +1,257 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. 
Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 + +/* Don't change following FR unless you know the effects. */ +#define VX0 $xr15 +#define VX1 $xr16 +#define VM0 $xr17 +#define VM1 $xr18 +#define VM2 $xr13 +#define VM3 $xr14 +#define res1 $xr19 +#define res2 $xr20 +#define VALPHA $xr21 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L97 + .align 3 + +.L10: + xvld VX0, X, 0 + xvld VX1, X, 4 * SIZE + xvfmaxa.d VM1, VX1, VX0 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t3, 2 + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t2, 1 + ld.d t3, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t3, 2 + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + xvpickve.d VX0, VM0, 1 + xvpickve.d VX1, VM0, 2 + xvpickve.d VM3, VM0, 3 + fmaxa.d $f17, $f17, $f14 + fmaxa.d $f17, $f17, $f15 + fmaxa.d $f17, $f17, $f16 + .align 3 + +.L97: + andi I, N, 7 + bge $r0, I, .L99 + .align 3 + +.L98: + xvld VX1, X, 0 + xvfmaxa.d VM0, VM0, VX1 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, SIZE + bne INCX, TEMP, .L120 + srai.d I, N, 3 + bge 
$r0, I, .L997 + .align 3 + +.L110: + xvld VX0, XX, 0 + xvld VX1, XX, 4 * SIZE + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + srai.d I, N, 3 + bge $r0, I, .L997 + +.L121: + ld.d t1, XX, 0 + add.d XX, XX, INCX + ld.d t2, XX, 0 + add.d XX, XX, INCX + ld.d t3, XX, 0 + add.d XX, XX, INCX + ld.d t4, XX, 0 + add.d XX, XX, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, XX, 0 + add.d XX, XX, INCX + ld.d t2, XX, 0 + add.d XX, XX, INCX + ld.d t3, XX, 0 + add.d XX, XX, INCX + ld.d t4, XX, 0 + add.d XX, XX, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.d VX0, res1, 1 + xvpickve.d VX1, res1, 2 + xvpickve.d VM2, res1, 3 + fadd.d $f19, $f19, $f15 + fadd.d $f19, $f19, $f16 + fadd.d $f19, $f19, $f13 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d $f15, XX, 0 + addi.d I, I, -1 + fmul.d $f15, $f15, ALPHA + fmadd.d $f19, $f15, $f15, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/dnrm2_lsx.S b/kernel/loongarch64/dnrm2_lsx.S new file mode 100644 index 000000000..fce4260e2 --- /dev/null +++ b/kernel/loongarch64/dnrm2_lsx.S @@ -0,0 +1,268 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 + +/* Don't change following FR unless you know the effects. */ +#define VX0 $vr15 +#define VX1 $vr16 +#define VM0 $vr17 +#define VM1 $vr18 +#define VM2 $vr13 +#define VM3 $vr14 +#define res1 $vr19 +#define res2 $vr20 +#define VALPHA $vr21 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + + bge $r0, I, .L97 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmaxa.d VM1, VX1, VX0 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmaxa.d VM2, VX1, VX0 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + vfmaxa.d VM0, VX0, VX1 + .align 3 + +.L97: + andi I, N, 7 + bge $r0, I, .L99 + .align 3 + +.L98: + vld VX1, X, 0 + vfmaxa.d VM0, VM0, VX1 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, SIZE + bne INCX, TEMP, .L120 + srai.d I, N, 3 + bge $r0, I, .L997 + .align 3 + +.L110: + vld VX0, XX, 0 * SIZE + vld VX1, XX, 2 * SIZE + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + vld VX0, XX, 4 * SIZE + vld VX1, XX, 6 * SIZE + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + srai.d I, N, 3 + bge $r0, I, .L997 + .align 3 + +.L121: + ld.d t1, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t2, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t4, XX, 0 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + ld.d t1, XX, 0 * SIZE + 
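+    // the remaining gathers below are interleaved with the scale (vfmul) and square-accumulate (vfmadd) of the previous pair to hide load latency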
add.d XX, XX, INCX + vfmul.d VM3, VX1, VALPHA + ld.d t2, XX, 0 * SIZE + add.d XX, XX, INCX + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + ld.d t3, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t4, XX, 0 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d $f15, XX, 0 * SIZE + addi.d I, I, -1 + fmul.d $f15, $f15, ALPHA + fmadd.d $f19, $f15, $f15, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dot_lasx.S b/kernel/loongarch64/dot_lasx.S new file mode 100644 index 000000000..0715b6311 --- /dev/null +++ b/kernel/loongarch64/dot_lasx.S @@ -0,0 +1,368 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 + +#define I $r17 +#define TEMP $r18 + +/* Don't change following FR unless you know the effects. 
*/
+#define s1 $f8
+#define s2 $f9
+#define a1 $f10
+#define b1 $f11
+
+PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT N, 0(N)
+    LDINT INCX, 0(INCX)
+    LDINT INCY, 0(INCY)
+#endif
+
+    /* init $f8 and $f9 to zero */
+    SUB s1, s1, s1
+    SUB s2, s2, s2
+    slli.d INCX, INCX, BASE_SHIFT
+    li.d TEMP, SIZE
+    slli.d INCY, INCY, BASE_SHIFT
+    bge $r0, N, .L999
+    bne INCX, TEMP, .L20 /* skip to .L20 if inc_x != 1 */
+    bne INCY, TEMP, .L20 /* skip to .L20 if inc_y != 1 */
+
+    /* (inc_x == 1) && (inc_y == 1) */
+
+    /* init $xr8 and $xr9 to zero */
+#ifdef DOUBLE
+    xvldrepl.d $xr0, X, 0
+#else
+    xvldrepl.w $xr0, X, 0
+#endif
+#ifdef DSDOT
+    xvfcvtl.d.s $xr0, $xr0
+    xvfsub.d $xr8, $xr0, $xr0
+    xvfsub.d $xr9, $xr0, $xr0
+#else
+    XVFSUB $xr8, $xr0, $xr0
+    XVFSUB $xr9, $xr0, $xr0
+#endif
+
+#ifdef DOUBLE
+    srai.d I, N, 4
+#else
+    srai.d I, N, 5
+#endif
+    bge $r0, I, .L12 /* FLOAT: <32 ; DOUBLE: <16 */
+    .align 3
+.L11:
+    /* FLOAT: 32~ ; DOUBLE: 16~ */
+    xvld $xr0, X, 0
+    xvld $xr1, X, 32
+    xvld $xr2, X, 64
+    xvld $xr3, X, 96
+    xvld $xr4, Y, 0
+    xvld $xr5, Y, 32
+    xvld $xr6, Y, 64
+    xvld $xr7, Y, 96
+    addi.w I, I, -1
+    addi.d X, X, 128
+    addi.d Y, Y, 128
+#ifdef DSDOT
+    /* DSDOT: widen lo/hi singles to doubles, accumulate x_lo*y_lo in $xr8 and x_hi*y_hi in $xr9 */
+    xvfcvtl.d.s $xr10, $xr0
+    xvfcvtl.d.s $xr11, $xr4
+    xvfcvth.d.s $xr12, $xr0
+    xvfcvth.d.s $xr13, $xr4
+    xvfmadd.d $xr8, $xr10, $xr11, $xr8
+    xvfmadd.d $xr9, $xr12, $xr13, $xr9
+    xvfcvtl.d.s $xr10, $xr1
+    xvfcvtl.d.s $xr11, $xr5
+    xvfcvth.d.s $xr12, $xr1
+    xvfcvth.d.s $xr13, $xr5
+    xvfmadd.d $xr8, $xr10, $xr11, $xr8
+    xvfmadd.d $xr9, $xr12, $xr13, $xr9
+    xvfcvtl.d.s $xr10, $xr2
+    xvfcvtl.d.s $xr11, $xr6
+    xvfcvth.d.s $xr12, $xr2
+    xvfcvth.d.s $xr13, $xr6
+    xvfmadd.d $xr8, $xr10, $xr11, $xr8
+    xvfmadd.d $xr9, $xr12, $xr13, $xr9
+    xvfcvtl.d.s $xr10, $xr3
+    xvfcvtl.d.s $xr11, $xr7
+    xvfcvth.d.s $xr12, $xr3
+    xvfcvth.d.s $xr13, $xr7
+    xvfmadd.d $xr8, $xr10, $xr11, $xr8
+    xvfmadd.d $xr9, $xr12, $xr13, $xr9
+#else
+    XVFMADD $xr8, $xr0, $xr4, $xr8
+    XVFMADD $xr9, $xr1, $xr5, $xr9
+    XVFMADD $xr8, $xr2, $xr6, $xr8
+    XVFMADD $xr9, $xr3, $xr7, $xr9
+#endif
+    bnez I, .L11
+    .align 3
+.L12:
+#ifdef DOUBLE
+    andi I, N, 0xf
+    srai.d I, I, 2
+#else
+    andi I, N, 0x1f
+    srai.d I, I, 3
+#endif
+    bge $r0, I, .L14 /* DOUBLE: <4 ; FLOAT: <8 */
+    .align 3
+.L13:
+    /* FLOAT: 8~31 ; DOUBLE: 4~15 */
+    xvld $xr0, X, 0
+    xvld $xr4, Y, 0
+    addi.w I, I, -1
+    addi.d X, X, 32
+    addi.d Y, Y, 32
+#ifdef DSDOT
+    xvfcvtl.d.s $xr10, $xr0
+    xvfcvtl.d.s $xr11, $xr4
+    xvfcvth.d.s $xr12, $xr0
+    xvfcvth.d.s $xr13, $xr4
+    xvfmadd.d $xr8, $xr10, $xr11, $xr8
+    xvfmadd.d $xr9, $xr12, $xr13, $xr9
+#else
+    XVFMADD $xr8, $xr0, $xr4, $xr8
+#endif
+    bnez I, .L13
+    .align 3
+.L14:
+    /* store dot in s1 $f8 */
+#ifdef DSDOT
+    xvfadd.d $xr8, $xr8, $xr9
+    fsub.s s2, s2, s2 /* set s2 to 0.0 */
+    xvpermi.q $xr0, $xr8, 0x1
+    vfadd.d $vr8, $vr8, $vr0
+    vpackod.d $vr0, $vr8, $vr8
+    vfadd.d $vr8, $vr8, $vr0
+#else
+    XVFADD $xr8, $xr8, $xr9
+    SUB s2, s2, s2 /* set s2 to 0.0 */
+    xvpermi.q $xr0, $xr8, 0x1
+    VFADD $vr8, $vr8, $vr0
+    vpackod.d $vr0, $vr8, $vr8
+#ifdef DOUBLE
+    VFADD $vr8, $vr8, $vr0
+#else
+    VFADD $vr8, $vr8, $vr0
+    vpackod.w $vr0, $vr8, $vr8
+    VFADD $vr8, $vr8, $vr0
+#endif /* defined DOUBLE */
+#endif /* defined DSDOT */
+    .align 3
+.L15:
+#ifdef DOUBLE
+    andi I, N, 0x3
+#else
+    andi I, N, 0x7
+#endif
+    bge $r0, I, .L999 /* =0 */
+    .align 3
+.L16:
+    /* FLOAT: 1~7 ; DOUBLE: 1~3 */
+    LD a1, X, 0
+    LD b1, Y, 0
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+    addi.d I, I, -1
+    addi.d X, X, SIZE
+    addi.d Y, Y, SIZE
+    bnez I, .L16
+    b .L999
+    .align 3
+
+.L20:
+/* !((inc_x == 1) && (inc_y == 1)) */
+    srai.d I, N, 3
+#ifdef F_INTERFACE
+    bgez INCX, .L21
+    addi.d TEMP, N, -1
+    mul.d TEMP, TEMP, INCX /* LoongArch64 has no mult/mflo/dsub; mul.d keeps the low 64 bits directly */
+    sub.d X, X, TEMP
+    .align 3
+
+.L21:
+    bgez INCY, .L22
+    addi.d TEMP, N, -1
+    mul.d TEMP, TEMP, INCY
+    sub.d Y, Y, TEMP
+    .align 3
+
+.L22:
+#endif
+    bge $r0, I, .L25 /* <8 */
+    .align 3
+
+.L23:
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s2, b1, a1, s2
+#else
+    MADD s2, b1, a1, s2
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s2, b1, a1, s2
+#else
+    MADD s2, b1, a1, s2
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s2, b1, a1, s2
+#else
+    MADD s2, b1, a1, s2
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    addi.d I, I, -1
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s2, b1, a1, s2
+#else
+    MADD s2, b1, a1, s2
+#endif
+    blt $r0, I, .L23
+    .align 3
+
+.L25:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L26:
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    addi.d I, I, -1
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+    blt $r0, I, .L26
+    .align 3
+
+.L999:
+#ifdef DSDOT
+    fadd.d $f0, s1, s2
+#else
+    ADD $f0, s1, s2
+#endif
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+
+EPILOGUE
diff --git a/kernel/loongarch64/dot_lsx.S b/kernel/loongarch64/dot_lsx.S
new file mode 100644
index 000000000..f7f613553
--- /dev/null
+++ b/kernel/loongarch64/dot_lsx.S
@@ -0,0 +1,364 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define Y $r7
+#define INCY $r8
+
+#define I $r17
+#define TEMP $r18
+
+/* Don't change following FR unless you know the effects. */
+#define s1 $f8
+#define s2 $f9
+#define a1 $f10
+#define b1 $f11
+
+PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT N, 0(N)
+    LDINT INCX, 0(INCX)
+    LDINT INCY, 0(INCY)
+#endif
+
+    /* init $f8 and $f9 to zero */
+    SUB s1, s1, s1
+    SUB s2, s2, s2
+    slli.d INCX, INCX, BASE_SHIFT
+    li.d TEMP, SIZE
+    slli.d INCY, INCY, BASE_SHIFT
+    bge $r0, N, .L999
+    bne INCX, TEMP, .L20 /* skip to .L20 if inc_x != 1 */
+    bne INCY, TEMP, .L20 /* skip to .L20 if inc_y != 1 */
+
+    /* (inc_x == 1) && (inc_y == 1) */
+
+    /* init $vr8 and $vr9 to zero */
+#ifdef DOUBLE
+    vldrepl.d $vr0, X, 0
+#else
+    vldrepl.w $vr0, X, 0
+#endif
+#ifdef DSDOT
+    vfcvtl.d.s $vr0, $vr0
+    vfsub.d $vr8, $vr0, $vr0
+    vfsub.d $vr9, $vr0, $vr0
+#else
+    VFSUB $vr8, $vr0, $vr0
+    VFSUB $vr9, $vr0, $vr0
+#endif
+
+#ifdef DOUBLE
+    srai.d I, N, 3
+#else
+    srai.d I, N, 4
+#endif
+    bge $r0, I, .L12 /* FLOAT: <16 ; DOUBLE: <8 */
+    .align 3
+.L11:
+    /* FLOAT: 16~ ; DOUBLE: 8~ */
+    vld $vr0, X, 0
+    vld $vr1, X, 16
+    vld $vr2, X, 32
+    vld $vr3, X, 48
+    vld $vr4, Y, 0
+    vld $vr5, Y, 16
+    vld $vr6, Y, 32
+    vld $vr7, Y, 48
+    addi.w I, I, -1
+    addi.d X, X, 64
+    addi.d Y, Y, 64
+#ifdef DSDOT
+    /* DSDOT: widen lo/hi singles to doubles, accumulate x_lo*y_lo in $vr8 and x_hi*y_hi in $vr9 */
+    vfcvtl.d.s $vr10, $vr0
+    vfcvtl.d.s $vr11, $vr4
+    vfcvth.d.s $vr12, $vr0
+    vfcvth.d.s $vr13, $vr4
+    vfmadd.d $vr8, $vr10, $vr11, $vr8
+    vfmadd.d $vr9, $vr12, $vr13, $vr9
+    vfcvtl.d.s $vr10, $vr1
+    vfcvtl.d.s $vr11, $vr5
+    vfcvth.d.s $vr12, $vr1
+    vfcvth.d.s $vr13, $vr5
+    vfmadd.d $vr8, $vr10, $vr11, $vr8
+    vfmadd.d $vr9, $vr12, $vr13, $vr9
+    vfcvtl.d.s $vr10, $vr2
+    vfcvtl.d.s $vr11, $vr6
+    vfcvth.d.s $vr12, $vr2
+    vfcvth.d.s $vr13, $vr6
+    vfmadd.d $vr8, $vr10, $vr11, $vr8
+    vfmadd.d $vr9, $vr12, $vr13, $vr9
+    vfcvtl.d.s $vr10, $vr3
+    vfcvtl.d.s $vr11, $vr7
+    vfcvth.d.s $vr12, $vr3
+    vfcvth.d.s $vr13, $vr7
+    vfmadd.d $vr8, $vr10, $vr11, $vr8
+    vfmadd.d $vr9, $vr12, $vr13, $vr9
+#else
+    VFMADD $vr8, $vr0, $vr4, $vr8
+    VFMADD $vr9, $vr1, $vr5, $vr9
+    VFMADD $vr8, $vr2, $vr6, $vr8
+    VFMADD $vr9, $vr3, $vr7, $vr9
+#endif
+    bnez I, .L11
+    .align 3
+.L12:
+#ifdef DOUBLE
+    andi I, N, 0x7
+    srai.d I, I, 1
+#else
+    andi I, N, 0xf
+    srai.d I, I, 2
+#endif
+    bge $r0, I, .L14 /* DOUBLE: <2 ; FLOAT: <4 */
+    .align 3
+.L13:
+    /* FLOAT: 4~15 ; DOUBLE: 2~7 */
+    vld $vr0, X, 0
+    vld $vr4, Y, 0
+    addi.w I, I, -1
+    addi.d X, X, 16
+    addi.d Y, Y, 16
+#ifdef DSDOT
+    vfcvtl.d.s $vr10, $vr0
+    vfcvtl.d.s $vr11, $vr4
+    vfcvth.d.s $vr12, $vr0
+    vfcvth.d.s $vr13, $vr4
+    vfmadd.d $vr8, $vr10, $vr11, $vr8
+    vfmadd.d $vr9, $vr12, $vr13, $vr9
+#else
+    VFMADD $vr8, $vr0, $vr4, $vr8
+#endif
+    bnez I, .L13
+    .align 3
+.L14:
+    /* store dot in s1 $f8 */
+#ifdef DSDOT
+    vfadd.d $vr8, $vr8, $vr9
+    fsub.s s2, s2, s2 /* set s2 to 0.0 */
+    vpackod.d $vr0, $vr8, $vr8
+    vfadd.d $vr8, $vr8, $vr0
+#else
+    VFADD $vr8, $vr8, $vr9
+    SUB s2, s2, s2 /* set s2 to 0.0 */
+    vpackod.d $vr0, $vr8, $vr8
+#ifdef DOUBLE
+    VFADD $vr8, $vr8, $vr0
+#else
+    VFADD $vr8, $vr8, $vr0
+    vpackod.w $vr0, $vr8, $vr8
+    VFADD $vr8, $vr8, $vr0
+#endif /* defined DOUBLE */
+#endif /* defined DSDOT */
+    .align 3
+.L15:
+#ifdef DOUBLE
+    andi I, N, 0x1
+#else
+    andi I, N, 0x3
+#endif
+    bge $r0, I, .L999 /* =0 */
+    .align 3
+.L16:
+    /* DOUBLE: 1 ; FLOAT: 1~3 */
+    LD a1, X, 0
+    LD b1, Y, 0
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+    addi.d I, I, -1
+    addi.d X, X, SIZE
+    addi.d Y, Y, SIZE
+    bnez I, .L16
+    b .L999
+    .align 3
+
+.L20:
+/* !((inc_x == 1) && (inc_y == 1)) */
+    srai.d I, N, 3
+#ifdef F_INTERFACE
+    bgez INCX, .L21
+    addi.d TEMP, N, -1
+    mul.d TEMP, TEMP, INCX /* LoongArch64 has no mult/mflo/dsub; mul.d keeps the low 64 bits directly */
+    sub.d X, X, TEMP
+    .align 3
+
+.L21:
+    bgez INCY, .L22
+    addi.d TEMP, N, -1
+    mul.d TEMP, TEMP, INCY
+    sub.d Y, Y, TEMP
+    .align 3
+
+.L22:
+#endif
+    bge $r0, I, .L25 /* <8 */
+    .align 3
+
+.L23:
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s2, b1, a1, s2
+#else
+    MADD s2, b1, a1, s2
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s2, b1, a1, s2
+#else
+    MADD s2, b1, a1, s2
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s2, b1, a1, s2
+#else
+    MADD s2, b1, a1, s2
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    addi.d I, I, -1
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s2, b1, a1, s2
+#else
+    MADD s2, b1, a1, s2
+#endif
+    blt $r0, I, .L23
+    .align 3
+
+.L25:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L26:
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    addi.d I, I, -1
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+    blt $r0, I, .L26
+    .align 3
+
+.L999:
+#ifdef DSDOT
+    fadd.d $f0, s1, s2
+#else
+    ADD $f0, s1, s2
+#endif
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+
+EPILOGUE
diff --git a/kernel/loongarch64/dscal_lasx.S b/kernel/loongarch64/dscal_lasx.S
new file mode 100644
index 000000000..153662378
--- /dev/null
+++ b/kernel/loongarch64/dscal_lasx.S
@@ -0,0 +1,194 @@
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define ALPHA $f0
+#define X $r7
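+// note: $r5 and $r6 carry unused dummy arguments of the SCAL kernel interface, so X arrives in $r7 and INCX in $r8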
+#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VT0 $xr14 +#define VT1 $xr15 +#define VALPHA $xr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + movgr2fr.d a2, TEMP + ffint.d.l a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + fcmp.ceq.d $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA, TEMP + move XX, X + .align 3 + +.L10: + bge $r0, I, .L32 + .align 3 +.L11: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX1, VALPHA + xvstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 3 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L23: + fst.d a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 +.L25: + xvxor.v VX0, VX0, VX0 + xvst VX0, X, 0 * SIZE + xvst VX0, X, 4 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L27: + fst.d a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA , TEMP + .align 3 + +.L31: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VT0, VX0, VALPHA + xvfmul.d VT1, VX1, VALPHA + addi.d I, I, -1 + xvst VT0, X, 0 * SIZE + xvst VT1, X, 4 * SIZE + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L33: + fld.d a1, X, 0 * SIZE + addi.d I, I, -1 + fmul.d a1, ALPHA, a1 + fst.d a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dscal_lsx.S b/kernel/loongarch64/dscal_lsx.S new file mode 100644 index 000000000..55f497752 --- 
/dev/null +++ b/kernel/loongarch64/dscal_lsx.S @@ -0,0 +1,205 @@ +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VT0 $vr14 +#define VT1 $vr15 +#define VALPHA $vr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + movgr2fr.d a2, TEMP + ffint.d.l a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + fcmp.ceq.d $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA, TEMP + move XX, X + .align 3 + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 + +.L11: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L23: + fst.d a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 +.L25: + vxor.v VX0, VX0, VX0 + vst VX0, X, 0 * SIZE + vst VX0, X, 2 * SIZE + vst VX0, X, 4 * SIZE + vst VX0, X, 6 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L27: + fst.d a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA , TEMP + .align 3 + +.L31: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vfmul.d VT1, VX1, VALPHA + vld VX0, X, 4 * SIZE + vst VT0, X, 0 * SIZE + vst VT1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vld VX1, X, 6 * SIZE + vst VT0, X, 4 * SIZE + vfmul.d VT1, VX1, VALPHA + vst VT1, X, 6 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 
3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L33: + fld.d a1, X, 0 * SIZE + addi.d I, I, -1 + fmul.d a1, ALPHA, a1 + fst.d a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/iamax_lasx.S b/kernel/loongarch64/iamax_lasx.S new file mode 100644 index 000000000..090da3004 --- /dev/null +++ b/kernel/loongarch64/iamax_lasx.S @@ -0,0 +1,542 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfmaxa.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmaxa.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 +#else + xvld VX0, X, 0 * SIZE + addi.d I, I, -1 + xvadd.w VI1, VI1, VINC8 + xvfmaxa.s VM1, VX0, VM0 + xvfcmp.ceq.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 +#endif + XVFMAXA VM1, x1, x2 + XVCMPEQ VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, 
VT0 + XVFMAXA VM0, x3, x4 + XVCMPEQ VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + XVFMAXA VM0, VM0, VM1 + XVCMPEQ VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + CMPEQ $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfmaxa.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmaxa.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + 
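+    // VT0 flags lanes where the running max is unchanged; the two selects below keep the old value and its index there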
xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvadd.w VI1, VI1, VINC8 + xvfmaxa.s VM1, VX0, VM0 + xvfcmp.ceq.s VT0, VM1, VM0 + addi.d I, I, -1 + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.d VM0, x4, x3 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmaxa.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfmaxa.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + CMPEQ $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 +#else + fmov.s $f16, $f20 +#endif + .align 3 + +#ifdef DOUBLE + +#else +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfmaxa.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 +.L272: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L292 + xvfcmp.clt.s 
VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + xvfmaxa.s VM0, VX0, VM0 + xvfcmp.ceq.s VT0, VM0, VX0 + xvbitsel.v VI0, VI0, VI1, VT0 + movfr2gr.s i0, $f20 +#endif + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD $f9, X, 0 + addi.d I, I, -1 + XVFMAXA VM1, x1, VM0 + XVCMPEQ VT0, VM0, VM1 + add.d X, X, INCX + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/iamax_lsx.S b/kernel/loongarch64/iamax_lsx.S new file mode 100644 index 000000000..ce5b3c724 --- /dev/null +++ b/kernel/loongarch64/iamax_lsx.S @@ -0,0 +1,482 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC2 $vr17 +#define VINC4 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L11 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC2, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfmaxa.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfmaxa.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmaxa.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + vbitsel.v x2, x4, x2, VT0 + vfmaxa.d VM1, VM0, x3 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, x2, VI0, VT0 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE +#else + vld VX0, X, 0 * SIZE + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC2 + vfmaxa.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + addi.d I, I, -1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmaxa.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L16 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L17 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC2, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC4, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC4, VINC2, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, 
VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+#endif
+    .align 3
+
+#ifdef DOUBLE
+.L16:
+    vfmaxa.d VM0, x1, x2
+    vfcmp.ceq.d VT0, x1, VM0
+    vbitsel.v VI0, VI2, VI1, VT0
+    .align 3
+
+.L17:
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L11: //INCX==1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L14
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L13:
+    fld.d $f9, X, 0
+    vfmaxa.d VM1, x1, VM0
+    vfcmp.ceq.d VT0, VM0, VM1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    addi.d I, I, -1
+    addi.d i1, i1, 1
+    addi.d X, X, SIZE
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L13
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L14:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    addi.d i0, i0, 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t2, 1
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+    .align 3
+
+.L24: // gather eight strided elements with scalar loads
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI1, VINC4
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfmaxa.d x1, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x1
+    vbitsel.v x2, VI2, VI1, VT0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI2, VINC2
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfmaxa.d x3, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x3
+    vbitsel.v x4, VI2, VI1, VT0
+    vfmaxa.d x3, x1, x3
+    vfcmp.ceq.d VT0, x1, x3
+    vbitsel.v x2, x4, x2, VT0
+    // merge this group's max into VM0 and keep the matching index in VI0
+    vfmaxa.d VM1, VM0, x3
+    vfcmp.ceq.d VT0, VM0, VM1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, x2, VI0, VT0
+    addi.d I, I, -1
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L27
+    .align 3
+
+.L26:
+    vfmaxa.d VM0, x1, x2
+    vfcmp.ceq.d VT0, x1, VM0
+    vbitsel.v VI0, VI2, VI1, VT0
+    .align 3
+
+.L27:
+    movfr2gr.d i0, $f20
+    .align 3
+
+#else
+.L20: // INCX!=1
+    move TEMP, X
+    addi.w i0, i0, 1
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC2, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC4, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+    .align 3
+
+.L24:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vadd.w VI1, VI1, VINC4
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vadd.w VI2, VI1, VINC2
+    vfmaxa.s VM1, VX0, VX1
+    vfcmp.ceq.s VT0, VX0, VM1
+    vbitsel.v VI2, VI2, VI1, VT0
+    vfmaxa.s VM1, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    addi.d I, I, -1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI2, VI0, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmaxa.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC2, VI2, VI1, VT0
+    vfmaxa.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC4, VI4, VI3, VT0
+    vfmaxa.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC4, VINC2, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26:
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    vfcmp.clt.s VT0, VI2, VI0
+    vbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    vfcmp.clt.s VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    vfcmp.clt.s VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.s i0, $f20
+    .align 3
+
+#endif
+.L21: // N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    LD $f9, X, 0
+    VFMAXA VM1, x1, VM0
+    VCMPEQ VT0, VM0, VM1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    addi.d I, I, -1
+    addi.d i1, i1, 1
+    add.d X, X, INCX
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    MTG i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/iamin_lasx.S b/kernel/loongarch64/iamin_lasx.S
new file mode 100644
index 000000000..6ea117907
--- /dev/null
+++ b/kernel/loongarch64/iamin_lasx.S
@@ -0,0 +1,486 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfmina.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmina.d VM1, VM0, VM1 +#else + addi.d I, I, -1 + xvadd.w VI2, VI1, VINC8 + xvfmina.s VM1, VX0, VM0 +#endif + XVCMPEQ VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 + blt $r0, 
I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmina.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmina.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 +#endif + xvbitsel.v VINC4, VI2, VI1, VT0 + XVFMINA VM0, x4, x3 + XVCMPEQ VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + XVFMINA VM0, VM0, VM1 + XVCMPEQ VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, 
X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfmina.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmina.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvadd.w VI2, VI1, VINC8 + xvfmina.s VM1, VX0, VM0 + xvfcmp.ceq.s VT0, VM1, VM0 +#endif + addi.d I, I, -1 + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 + .align 3 +#else + fmov.s $f16, $f20 + .align 3 + +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfmina.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmina.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmina.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + xvfmina.s VM0, VX0, VM0 + xvfcmp.ceq.s VT0, VM0, VX0 + xvbitsel.v VI0, VI0, VI1, VT0 + movfr2gr.s i0, $f20 + +#endif + +.L21: // N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD $f9, X, 0 + addi.d I, I, -1 + XVFMINA VM1, x1, VM0 + XVCMPEQ VT0, VM0, VM1 + add.d X, X, INCX + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + 
.align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/iamin_lsx.S b/kernel/loongarch64/iamin_lsx.S new file mode 100644 index 000000000..ce885fd88 --- /dev/null +++ b/kernel/loongarch64/iamin_lsx.S @@ -0,0 +1,446 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#ifdef DOUBLE +#define VINC2 $vr17 +#define VINC4 $vr18 +#else +#define VINC4 $vr17 +#define VINC8 $vr18 +#endif +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfmina.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfmina.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmina.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + addi.d I, I, -1 + vbitsel.v x2, x4, x2, VT0 + vfmina.d VM1, VM0, x3 +#else + vadd.w VI1, VI1, VINC8 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC4 + vfmina.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + addi.d I, I, -1 + vbitsel.v x2, VI2, VI1, VT0 + vfmina.s VM1, VM0, VM1 +#endif + VCMPEQ VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, x2, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + 
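+// Strided path (INCX != 1): vector lanes are filled with scalar loads while
+// VI1/VI2 carry the running element indices; VM0 and VI0 hold the current
+// minimum magnitude and its index, mirroring the unit-stride loop above.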
+.L20: // INCX!=1
+    move TEMP, X
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+
+    vinsgr2vr.d VM0, t2, 1
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w i0, i0, 1
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L24:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI1, VINC4
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfmina.d x1, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x1
+    vbitsel.v x2, VI2, VI1, VT0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI2, VINC2
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfmina.d x3, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x3
+    vbitsel.v x4, VI2, VI1, VT0
+    vfmina.d x3, x1, x3
+    vfcmp.ceq.d VT0, x1, x3
+    addi.d I, I, -1
+    vbitsel.v x2, x4, x2, VT0
+    // merge this group's min into VM0 and keep the matching index in VI0
+    vfmina.d VM1, VM0, x3
+    vfcmp.ceq.d VT0, VM0, VM1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, x2, VI0, VT0
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vadd.w VI1, VI1, VINC8
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vadd.w VI2, VI1, VINC4
+    vfmina.s VM1, VX0, VX1
+    vfcmp.ceq.s VT0, VX0, VM1
+    vbitsel.v VI2, VI2, VI1, VT0
+    vfmina.s VM1, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    addi.d I, I, -1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI2, VI0, VT0
+#endif
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmina.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC4, VI2, VI1, VT0
+    vfmina.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC8, VI4, VI3, VT0
+    vfmina.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC8, VINC4, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+#endif
+    .align 3
+
+.L26:
+#ifdef DOUBLE
+    vfmina.d VM0, x1, x2
+    vfcmp.ceq.d VT0, x1, VM0
+    vbitsel.v VI0, VI2, VI1, VT0
+    .align 3
+
+.L27:
+    movfr2gr.d i0, $f20
+
+#else
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    vfcmp.clt.s VT0, VI2, VI0
+    vbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    vfcmp.clt.s VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    vfcmp.clt.s VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.s i0, $f20
+#endif
+    .align 3
+
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    LD $f9, X, 0
+    addi.d I, I, -1
+    VFMINA VM1, x1, VM0
+    VCMPEQ VT0, VM0, VM1
+    add.d X, X, INCX
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    addi.d i1, i1, 1
+    MTC $f21, i1
+    blt $r0, I, .L22
+    MTG i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/icamax_lasx.S b/kernel/loongarch64/icamax_lasx.S
new file mode 100644
index 000000000..7800cb917
--- /dev/null
+++ b/kernel/loongarch64/icamax_lasx.S
@@ -0,0 +1,562 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + xvxor.v VM0, VM0, VM0 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + xvxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + xvreplgr2vr.d VI4, I + xvffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + li.w I, -1 + xvreplgr2vr.w VI4, I + xvffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC4 + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 +#else + xvadd.w VI1, VI1, VINC8 + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VI4, x1 + xvfmul.s x4, VI4, x2 + xvfcmp.clt.s VT0, x1, VI3 + xvfcmp.clt.s VINC4, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC4 +#endif + XVFADD x1, x1, x2 + XVFMAX x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + 
xvfmax.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmax.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmax.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + xvadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + addi.d I, I, -1 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + xvfadd.d x1, x1, x2 + xvfmax.d x3, VM0, x1 + xvfcmp.ceq.d VT0, x3, VM0 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * 
SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VI4, x1 + xvfmul.s x4, VI4, x2 + xvfcmp.clt.s VT0, x1, VI3 + xvfcmp.clt.s VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + xvfadd.s x1, x1, x2 + xvfmax.s x3, VM0, x1 + xvfcmp.ceq.s VT0, x3, VM0 +#endif + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 +#else + fmov.s $f16, $f20 +#endif + .align 3 + +#ifdef DOUBLE +#else +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v x1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, 
VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + fcmp.clt.s $fcc0, $f15, $f13 + fsel $f15, $f15, $f13, $fcc0 + fsel $f20, $f20, $f16, $fcc0 + movfr2gr.s i0, $f20 + +#endif +.L21: //N<8 +#ifdef DOUBLE + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 +#else + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 +#endif + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMAX a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamax_lsx.S b/kernel/loongarch64/icamax_lsx.S new file mode 100644 index 000000000..a2fc9dbbd --- /dev/null +++ b/kernel/loongarch64/icamax_lsx.S @@ -0,0 +1,434 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + vxor.v VM0, VM0, VM0 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + vxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + vreplgr2vr.d VI4, I + vffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + li.w I, -1 + vreplgr2vr.w VI4, I + vffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + addi.d I, I, -1 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 +#else + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VI4, x1 + vfmul.s x4, VI4, x2 +#endif + VCMPLT VT0, x1, VI3 + VCMPLT VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + VFADD x1, x1, x2 + VFMAX x3, VM0, x1 + VCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + 
vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + ld.d t1, X, 0 * SIZE + vfcmp.ceq.d VT0, x3, VM0 + ld.d t2, X, 1 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + addi.d I, I, -1 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vadd.w VI1, VI1, VINC4 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VI4, x1 + vfmul.s x4, VI4, x2 + vfcmp.clt.s VT0, x1, VI3 + vfcmp.clt.s VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.s x1, x1, x2 + vfmax.s x3, VM0, x1 + vfcmp.ceq.s VT0, x3, VM0 +#endif + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 
2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +#ifdef DOUBLE +.L26: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 +#else +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +#endif +.L21: //N<4 + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMAX a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamin_lasx.S b/kernel/loongarch64/icamin_lasx.S new file mode 100644 index 000000000..01abd45b2 --- /dev/null +++ b/kernel/loongarch64/icamin_lasx.S @@ -0,0 +1,555 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 +#ifdef DOUBLE + xvreplve0.d VM0, VM0 + xvxor.v VI3, VI3, VI3 // 0 + li.d I, -1 + xvreplgr2vr.d VI4, I + xvffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + xvreplve0.w VM0, VM0 + xvxor.v VI3, VI3, VI3 // 0 + li.w I, -1 + xvreplgr2vr.w VI4, I + xvffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC4 + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 +#else + xvadd.w VI1, VI1, VINC8 + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VI4, x1 + xvfmul.s x4, VI4, x2 + xvfcmp.clt.s VT0, x1, VI3 + xvfcmp.clt.s VINC4, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC4 +#endif + XVFADD x1, x1, x2 + XVFMIN x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + 
xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmin.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmin.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmin.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + xvadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w 
x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + addi.d I, I, -1 + XVFMUL x3, VI4, x1 + XVFMUL x4, VI4, x2 + XVCMPLT VT0, x1, VI3 + XVCMPLT VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + XVFADD x1, x1, x2 + XVFMIN x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmina.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmina.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmina.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 +#endif + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 + .align 3 + +.L21: //N<4 + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 +#else + fmov.s $f16, $f20 + .align 3 + +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v x1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, 
.L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + fcmp.clt.s $fcc0, $f15, $f13 + fsel $f15, $f15, $f13, $fcc0 + fsel $f20, $f20, $f16, $fcc0 + movfr2gr.s i0, $f20 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 +#endif + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMIN a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamin_lsx.S b/kernel/loongarch64/icamin_lsx.S new file mode 100644 index 000000000..a08cd33c5 --- /dev/null +++ b/kernel/loongarch64/icamin_lsx.S @@ -0,0 +1,425 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 + vreplvei.w VM0, VM0, 0 + vxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + vreplgr2vr.d VI4, I + vffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + li.w I, -1 + vreplgr2vr.w VI4, I + vffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + addi.d I, I, -1 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmin.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + VFMUL x3, VI4, x1 + VFMUL x4, VI4, x2 + VCMPLT VT0, x1, VI3 + VCMPLT VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + VFADD x1, x1, x2 + VFMIN x3, VM0, x1 + VCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, 
VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmin.d x3, VM0, x1 + ld.d t1, X, 0 * SIZE + vfcmp.ceq.d VT0, x3, VM0 + ld.d t2, X, 1 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vadd.w VI1, VI1, VINC4 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + addi.d I, I, -1 + VFMUL x3, VI4, x1 + VFMUL x4, VI4, x2 + VCMPLT VT0, x1, VI3 + VCMPLT VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + VFADD x1, x1, x2 + VFMIN x3, VM0, x1 + VCMPEQ VT0, x3, VM0 + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + 
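+ // the compare below tells which partial minimum vfmina kept, so VI0 can take the matching lane index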
vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + vfmina.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 +#endif + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: +#ifdef DOUBLE + movfr2gr.d i0, $f20 + .align 3 +#else + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +#endif +.L21: //N<4 + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMIN a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/imax_lasx.S b/kernel/loongarch64/imax_lasx.S new file mode 100644 index 000000000..2d3d5e9d3 --- /dev/null +++ b/kernel/loongarch64/imax_lasx.S @@ -0,0 +1,533 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX0, VX1 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else + xvadd.w VI1, VI1, VINC8 + xvfcmp.clt.s VT0, VM0, VX0 + addi.d I, I, -1 + xvbitsel.v VM0, VM0, VX0, VT0 + xvbitsel.v VI0, VI0, VI1, VT0 + addi.d X, X, 8 * SIZE +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfcmp.clt.d VT0, x1, x2 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 +#endif + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + XVCMPLT VT0, x3, x4 + 
xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + XVCMPLT VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE +#endif + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX +#ifdef DOUBLE + xvinsgr2vr.d VM0, t1, 0 + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI1, VI1, VINC8 + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX0, VX1 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else + 
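+ // single precision: gather eight strided elements into VX0, then update the running maximum VM0 and its index vector VI0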
ld.w t1, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t2, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t3, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t4, X, 0 * SIZE
+ add.d X, X, INCX
+ xvinsgr2vr.w VX0, t1, 0
+ xvinsgr2vr.w VX0, t2, 1
+ xvinsgr2vr.w VX0, t3, 2
+ xvinsgr2vr.w VX0, t4, 3
+ ld.w t1, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t2, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t3, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t4, X, 0 * SIZE
+ add.d X, X, INCX
+ xvinsgr2vr.w VX0, t1, 4
+ xvinsgr2vr.w VX0, t2, 5
+ xvinsgr2vr.w VX0, t3, 6
+ xvinsgr2vr.w VX0, t4, 7
+ xvadd.w VI1, VI1, VINC8
+ xvfcmp.clt.s VT0, VM0, VX0
+ addi.d I, I, -1
+ xvbitsel.v VM0, VM0, VX0, VT0
+ xvbitsel.v VI0, VI0, VI1, VT0
+#endif
+ blt $r0, I, .L24
+ .align 3
+
+.L25:
+#ifdef DOUBLE
+ xvpickve.d VI1, VI0, 0
+ xvpickve.d VI2, VI0, 1
+ xvpickve.d VI3, VI0, 2
+ xvpickve.d VI4, VI0, 3
+ xvpickve.d x1, VM0, 0
+ xvpickve.d x2, VM0, 1
+ xvpickve.d x3, VM0, 2
+ xvpickve.d x4, VM0, 3
+#else
+ xvxor.v VX0, VX0, VX0
+ xvor.v VX0, VI0, VX0
+ xvxor.v VX1, VX1, VX1
+ xvor.v VX1, VM0, VX1
+ xvpickve.w VI1, VI0, 0
+ xvpickve.w VI2, VI0, 1
+ xvpickve.w VI3, VI0, 2
+ xvpickve.w VI4, VI0, 3
+ xvpickve.w x1, VM0, 0
+ xvpickve.w x2, VM0, 1
+ xvpickve.w x3, VM0, 2
+ xvpickve.w x4, VM0, 3
+#endif
+ XVCMPLT VT0, x1, x2
+ xvbitsel.v VM1, x1, x2, VT0
+ xvbitsel.v VINC4, VI1, VI2, VT0
+ XVCMPLT VT0, x3, x4
+ xvbitsel.v VM0, x3, x4, VT0
+ xvbitsel.v VINC8, VI3, VI4, VT0
+ XVCMPLT VT0, VM0, VM1
+ xvbitsel.v VM0, VM0, VM1, VT0
+ xvbitsel.v VI0, VINC8, VINC4, VT0
+ fcmp.ceq.d $fcc0, $f15, $f9
+ bceqz $fcc0, .L26
+ XVCMPLT VT0, VI1, VI0
+ xvbitsel.v VI0, VI0, VI1, VT0
+ .align 3
+
+.L26:
+ fcmp.ceq.d $fcc0, $f15, $f10
+ bceqz $fcc0, .L27
+ XVCMPLT VT0, VI2, VI0
+ xvbitsel.v VI0, VI0, VI2, VT0
+ .align 3
+
+.L27:
+ fcmp.ceq.d $fcc0, $f15, $f11
+ bceqz $fcc0, .L28
+ XVCMPLT VT0, VI3, VI0
+ xvbitsel.v VI0, VI0, VI3, VT0
+ .align 3
+
+.L28:
+ fcmp.ceq.d $fcc0, $f15, $f12
+ bceqz $fcc0, .L29
+ XVCMPLT VT0, VI4, VI0
+ xvbitsel.v VI0, VI0, VI4, VT0
+ .align 3
+
+.L29:
+#ifdef DOUBLE
+ movfr2gr.d i0, $f20
+#else
+ fmov.s $f16, $f20
+#endif
+ .align 3
+
+#ifndef DOUBLE
+.L252:
+ xvxor.v VI0, VI0, VI0
+ xvor.v VI0, VI0, VX0
+ fmov.s $f13, $f15
+ xvxor.v VM0, VM0, VM0
+ xvor.v VM0, VM0, VX1
+ xvpickve.w VI1, VI0, 4
+ xvpickve.w VI2, VI0, 5
+ xvpickve.w VI3, VI0, 6
+ xvpickve.w VI4, VI0, 7
+ xvpickve.w x1, VM0, 4
+ xvpickve.w x2, VM0, 5
+ xvpickve.w x3, VM0, 6
+ xvpickve.w x4, VM0, 7
+ xvfcmp.clt.s VT0, x1, x2
+ xvbitsel.v x1, x1, x2, VT0
+ xvbitsel.v VINC4, VI1, VI2, VT0
+ xvfcmp.clt.s VT0, x3, x4
+ xvbitsel.v VM0, x3, x4, VT0
+ xvbitsel.v VINC8, VI3, VI4, VT0
+ xvfcmp.clt.s VT0, VM0, x1
+ xvbitsel.v VM0, VM0, x1, VT0
+ xvbitsel.v VI0, VINC8, VINC4, VT0
+ li.d TEMP, 1 //when the values are equal, take the smallest index
+ movgr2fr.w $f17, TEMP
+ ffint.s.w $f17, $f17
+ xvfcmp.ceq.s VT0, VM0, x1
+ fcmp.ceq.s $fcc0, $f23, $f17
+ bceqz $fcc0, .L262
+ xvfcmp.clt.s VT0, VI1, VI0
+ xvbitsel.v VI0, VI0, VI1, VT0
+ .align 3
+
+.L262:
+ xvfcmp.ceq.s VT0, VM0, x2
+ fcmp.ceq.s $fcc0, $f23, $f17
+ bceqz $fcc0, .L272
+ xvfcmp.clt.s VT0, VI2, VI0
+ xvbitsel.v VI0, VI0, VI2, VT0
+ .align 3
+
+.L272:
+ xvfcmp.ceq.s VT0, VM0, x3
+ fcmp.ceq.s $fcc0, $f23, $f17
+ bceqz $fcc0, .L282
+ xvfcmp.clt.s VT0, VI3, VI0
+ xvbitsel.v VI0, VI0, VI3, VT0
+ .align 3
+
+.L282:
+ xvfcmp.ceq.s VT0, VM0, x4
+ fcmp.ceq.s $fcc0, $f23, $f17
+ bceqz $fcc0, .L292
+ xvfcmp.clt.s VT0, VI4, VI0
+ xvbitsel.v VI0, VI0, VI4, VT0
+ .align 3
+
+.L292:
+ fcmp.clt.s $fcc0, $f15, $f13
+ fsel $f15, $f15, $f13, $fcc0
+ fsel $f20, $f20, $f16, $fcc0
+ movfr2gr.s i0, $f20
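+ // i0 now holds the 1-based index of the current maximum; the scalar tail loop below may still update it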
+#endif + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + CMPLT $fcc0, $f15, $f9 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/imax_lsx.S b/kernel/loongarch64/imax_lsx.S new file mode 100644 index 000000000..92556d4e6 --- /dev/null +++ b/kernel/loongarch64/imax_lsx.S @@ -0,0 +1,428 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, $vr18 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, $vr17 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x1, x3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, VM0, x1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + vadd.w VI1, VI1, $vr18 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VCMPLT VT0, x1, x2 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + VCMPLT VT0, x3, x4 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + VCMPLT VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 
1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, $vr18 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, $vr17 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x1, x3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, VM0, x1 + addi.d I, I, -1 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, $vr18 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 
+ vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfcmp.clt.s VT0, x1, x2 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + vfcmp.clt.s VT0, x3, x4 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + vfcmp.clt.s VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + VCMPLT VT0, x1, x2 + vbitsel.v VM0, x1, x2, VT0 + vbitsel.v VI0, VI1, VI2, VT0 +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + VCMPLT VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + .align 3 + +.L27: +#ifndef DOUBLE + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + VCMPLT VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + VCMPLT VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#endif + MTG i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + CMPLT $fcc0, $f15, $f9 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/imin_lasx.S b/kernel/loongarch64/imin_lasx.S new file mode 100644 index 000000000..5306828e2 --- /dev/null +++ b/kernel/loongarch64/imin_lasx.S @@ -0,0 +1,534 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + XVCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + XVCMPLT VT0, VM1, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else + xvadd.w VI1, VI1, VINC8 + XVCMPLT VT0, VX0, VM0 + addi.d I, I, -1 + xvbitsel.v VM0, VM0, VX0, VT0 + xvbitsel.v VI0, VI0, VI1, VT0 + addi.d X, X, 8 * SIZE +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 +#endif + XVCMPLT VT0, x2, x1 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + XVCMPLT VT0, x4, x3 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, 
VI3, VI4, VT0 + XVCMPLT VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI1, VI1, VINC8 + xvadd.d VI2, VI1, VINC4 + XVCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + XVCMPLT VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + 
xvbitsel.v VI0, VI0, VI2, VT0
+#else
+ ld.w t1, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t2, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t3, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t4, X, 0 * SIZE
+ add.d X, X, INCX
+ xvinsgr2vr.w VX0, t1, 0
+ xvinsgr2vr.w VX0, t2, 1
+ xvinsgr2vr.w VX0, t3, 2
+ xvinsgr2vr.w VX0, t4, 3
+ ld.w t1, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t2, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t3, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t4, X, 0 * SIZE
+ add.d X, X, INCX
+ xvinsgr2vr.w VX0, t1, 4
+ xvinsgr2vr.w VX0, t2, 5
+ xvinsgr2vr.w VX0, t3, 6
+ xvinsgr2vr.w VX0, t4, 7
+ xvadd.w VI1, VI1, VINC8
+ XVCMPLT VT0, VX0, VM0
+ addi.d I, I, -1
+ xvbitsel.v VM0, VM0, VX0, VT0
+ xvbitsel.v VI0, VI0, VI1, VT0
+#endif
+ blt $r0, I, .L24
+ .align 3
+
+.L25:
+#ifdef DOUBLE
+ xvpickve.d VI1, VI0, 0
+ xvpickve.d VI2, VI0, 1
+ xvpickve.d VI3, VI0, 2
+ xvpickve.d VI4, VI0, 3
+ xvpickve.d x1, VM0, 0
+ xvpickve.d x2, VM0, 1
+ xvpickve.d x3, VM0, 2
+ xvpickve.d x4, VM0, 3
+#else
+ xvxor.v VX0, VX0, VX0
+ xvor.v VX0, VI0, VX0
+ xvxor.v VX1, VX1, VX1
+ xvor.v VX1, VM0, VX1
+ xvpickve.w VI1, VI0, 0
+ xvpickve.w VI2, VI0, 1
+ xvpickve.w VI3, VI0, 2
+ xvpickve.w VI4, VI0, 3
+ xvpickve.w x1, VM0, 0
+ xvpickve.w x2, VM0, 1
+ xvpickve.w x3, VM0, 2
+ xvpickve.w x4, VM0, 3
+#endif
+ XVCMPLT VT0, x2, x1
+ xvbitsel.v VM1, x1, x2, VT0
+ xvbitsel.v VINC4, VI1, VI2, VT0
+ XVCMPLT VT0, x4, x3
+ xvbitsel.v VM0, x3, x4, VT0
+ xvbitsel.v VINC8, VI3, VI4, VT0
+ XVCMPLT VT0, VM1, VM0
+ xvbitsel.v VM0, VM0, VM1, VT0
+ xvbitsel.v VI0, VINC8, VINC4, VT0
+ fcmp.ceq.d $fcc0, $f15, $f9
+ bceqz $fcc0, .L26
+ XVCMPLT VT0, VI1, VI0
+ xvbitsel.v VI0, VI0, VI1, VT0
+ .align 3
+
+.L26:
+ fcmp.ceq.d $fcc0, $f15, $f10
+ bceqz $fcc0, .L27
+ XVCMPLT VT0, VI2, VI0
+ xvbitsel.v VI0, VI0, VI2, VT0
+ .align 3
+
+.L27:
+ fcmp.ceq.d $fcc0, $f15, $f11
+ bceqz $fcc0, .L28
+ XVCMPLT VT0, VI3, VI0
+ xvbitsel.v VI0, VI0, VI3, VT0
+ .align 3
+
+.L28:
+ fcmp.ceq.d $fcc0, $f15, $f12
+ bceqz $fcc0, .L29
+ XVCMPLT VT0, VI4, VI0
+ xvbitsel.v VI0, VI0, VI4, VT0
+ .align 3
+
+.L29:
+#ifdef DOUBLE
+ MTG i0, $f20
+#else
+ fmov.s $f16, $f20
+#endif
+ .align 3
+
+#ifndef DOUBLE
+.L252:
+ xvxor.v VI0, VI0, VI0
+ xvor.v VI0, VI0, VX0
+ fmov.s $f13, $f15
+ xvxor.v VM0, VM0, VM0
+ xvor.v VM0, VM0, VX1
+ xvpickve.w VI1, VI0, 4
+ xvpickve.w VI2, VI0, 5
+ xvpickve.w VI3, VI0, 6
+ xvpickve.w VI4, VI0, 7
+ xvpickve.w x1, VM0, 4
+ xvpickve.w x2, VM0, 5
+ xvpickve.w x3, VM0, 6
+ xvpickve.w x4, VM0, 7
+ XVCMPLT VT0, x2, x1
+ xvbitsel.v x1, x1, x2, VT0
+ xvbitsel.v VINC4, VI1, VI2, VT0
+ XVCMPLT VT0, x4, x3
+ xvbitsel.v VM0, x3, x4, VT0
+ xvbitsel.v VINC8, VI3, VI4, VT0
+ XVCMPLT VT0, x1, VM0
+ xvbitsel.v VM0, VM0, x1, VT0
+ xvbitsel.v VI0, VINC8, VINC4, VT0
+ li.d TEMP, 1 //when the values are equal, take the smallest index
+ movgr2fr.w $f17, TEMP
+ ffint.s.w $f17, $f17
+ xvfcmp.ceq.s VT0, VM0, x1
+ fcmp.ceq.s $fcc0, $f23, $f17
+ bceqz $fcc0, .L262
+ XVCMPLT VT0, VI1, VI0
+ xvbitsel.v VI0, VI0, VI1, VT0
+ .align 3
+
+.L262:
+ xvfcmp.ceq.s VT0, VM0, x2
+ fcmp.ceq.s $fcc0, $f23, $f17
+ bceqz $fcc0, .L272
+ XVCMPLT VT0, VI2, VI0
+ xvbitsel.v VI0, VI0, VI2, VT0
+ .align 3
+
+.L272:
+ xvfcmp.ceq.s VT0, VM0, x3
+ fcmp.ceq.s $fcc0, $f23, $f17
+ bceqz $fcc0, .L282
+ XVCMPLT VT0, VI3, VI0
+ xvbitsel.v VI0, VI0, VI3, VT0
+ .align 3
+
+.L282:
+ xvfcmp.ceq.s VT0, VM0, x4
+ fcmp.ceq.s $fcc0, $f23, $f17
+ bceqz $fcc0, .L292
+ XVCMPLT VT0, VI4, VI0
+ xvbitsel.v VI0, VI0, VI4, VT0
+ .align 3
+
+.L292:
+ CMPLT $fcc0, $f13, $f15
+ fsel $f15, $f15, $f13, $fcc0
+ fsel $f20, $f20, $f16, $fcc0
+ MTG i0, $f20
+#endif
+
+.L21: //N<8 + 
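+ // scalar tail: walk the remaining N % 8 elements one at a time, keeping the smaller value and its index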
andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + CMPLT $fcc0, $f9, $f15 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/imin_lsx.S b/kernel/loongarch64/imin_lsx.S new file mode 100644 index 000000000..a0c411e7a --- /dev/null +++ b/kernel/loongarch64/imin_lsx.S @@ -0,0 +1,428 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, $vr18 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, $vr17 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x3, x1 + addi.d X, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, x1, VM0 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + vadd.w VI1, VI1, $vr18 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM1, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VCMPLT VT0, x2, x1 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + VCMPLT VT0, x4, x3 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + VCMPLT VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 
1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, $vr18 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, $vr17 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x3, x1 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, x1, VM0 + addi.d I, I, -1 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, $vr18 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 
+ vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VCMPLT VT0, x2, x1 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + VCMPLT VT0, x4, x3 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + VCMPLT VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + VCMPLT VT0, x2, x1 + vbitsel.v VM0, x1, x2, VT0 + vbitsel.v VI0, VI1, VI2, VT0 +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + VCMPLT VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + .align 3 + +.L27: +#ifndef DOUBLE + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + VCMPLT VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + VCMPLT VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#endif + MTG i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + CMPLT $fcc0, $f9, $f15 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/max_lasx.S b/kernel/loongarch64/max_lasx.S new file mode 100644 index 000000000..3215ae394 --- /dev/null +++ b/kernel/loongarch64/max_lasx.S @@ -0,0 +1,229 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMAX VM1, VX0, VX1 + XVFMAX VM2, VX2, VX3 + XVFMAX VM0, VM0, VM1 + XVFMAX VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMAX VM1, VX0, VX1 + XVFMAX VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAX VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAX VM1, VX0, VX1 + XVFMAX VM2, VX2, VX3 + XVFMAX VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMAX VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAX $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + XVFMAX VM1, VX0, VX1 // signed maximum of the gathered lanes + XVFMAX VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + XVFMAX VM0, VM0, VM1 // signed maximum, matching the unit-stride path +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAX VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAX VM1, VX0, VX1 + XVFMAX VM2, VX2, VX3 + XVFMAX VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMAX VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAX $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff
--git a/kernel/loongarch64/max_lsx.S b/kernel/loongarch64/max_lsx.S new file mode 100644 index 000000000..a2100875d --- /dev/null +++ b/kernel/loongarch64/max_lsx.S @@ -0,0 +1,228 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMAX VM1, VX0, VX1 + VFMAX VM2, VX2, VX3 + VFMAX VM0, VM0, VM1 + VFMAX VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMAX VM1, VX0, VX1 + VFMAX VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAX VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAX VM1, VX0, VX1 + VFMAX VM2, VX2, VX3 + VFMAX VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAX $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + 
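+ // pack the two strided doubles into VX0 lanes 0-1; the next pair is gathered into VX1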
vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + VFMAX VM1, VX0, VX1 // signed maximum of the gathered lanes + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + VFMAX VM2, VX0, VX1 + VFMAX VM1, VM1, VM2 + VFMAX VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + VFMAX VM1, VX0, VX1 + VFMAX VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAX VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAX VM1, VX0, VX1 + VFMAX VM2, VX2, VX3 + VFMAX VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAX $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/min_lasx.S b/kernel/loongarch64/min_lasx.S new file mode 100644 index 000000000..890c8882d --- /dev/null +++ b/kernel/loongarch64/min_lasx.S @@ -0,0 +1,229 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMIN VM1, VX0, VX1 + XVFMIN VM2, VX2, VX3 + XVFMIN VM0, VM0, VM1 + XVFMIN VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMIN VM1, VX0, VX1 + XVFMIN VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMIN VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMIN VM1, VX0, VX1 + XVFMIN VM2, VX2, VX3 + XVFMIN VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMIN VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMIN $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + XVFMIN VM1, VX0, VX1 // keep the running minimum of the gathered lanes + XVFMIN VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + XVFMIN VM0, VM0, VM1 // keep the running minimum, matching the unit-stride path +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMIN VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMIN VM1, VX0, VX1 + XVFMIN VM2, VX2, VX3 + XVFMIN VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMIN VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMIN $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff
--git a/kernel/loongarch64/min_lsx.S b/kernel/loongarch64/min_lsx.S new file mode 100644 index 000000000..1bc32a0ed --- /dev/null +++ b/kernel/loongarch64/min_lsx.S @@ -0,0 +1,228 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMIN VM1, VX0, VX1 + VFMIN VM2, VX2, VX3 + VFMIN VM0, VM0, VM1 + VFMIN VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMIN VM1, VX0, VX1 + VFMIN VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMIN VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMIN VM1, VX0, VX1 + VFMIN VM2, VX2, VX3 + VFMIN VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMIN $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + 
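+ // gather the strided doubles into vector lanes before taking the vector minimum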
vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + VFMIN VM1, VX0, VX1 // keep the running minimum of the gathered lanes + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + VFMIN VM2, VX0, VX1 + VFMIN VM1, VM1, VM2 + VFMIN VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + VFMIN VM1, VX0, VX1 + VFMIN VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMIN VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMIN VM1, VX0, VX1 + VFMIN VM2, VX2, VX3 + VFMIN VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMIN $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/rot_lasx.S b/kernel/loongarch64/rot_lasx.S new file mode 100644 index 000000000..5d7e3d7cc --- /dev/null +++ b/kernel/loongarch64/rot_lasx.S @@ -0,0 +1,1602 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VT0 $xr10 +#define VT1 $xr18 +#define VXC $xr23 +#define VXS $xr9 +#define VXZ $xr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT +#ifdef DOUBLE + movfr2gr.d t1, C + xvreplgr2vr.d VXC, t1 + movfr2gr.d t2, S + xvreplgr2vr.d VXS, t2 + movfr2gr.d t3, a1 + xvreplgr2vr.d VXZ, t3 +#else + movfr2gr.s t1, C + xvreplgr2vr.w VXC, t1 + movfr2gr.s t2, S + xvreplgr2vr.w VXS, t2 + movfr2gr.s t3, a1 + xvreplgr2vr.w VXZ, t3 +#endif + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE +#endif + XVMUL VT0, VX0, VXC + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VX0, VXS + XVMSUB VT1, VX2, VXC, VT1 + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE +#ifdef DOUBLE + XVMUL VT0, VX1, VXC + XVFMADD VT0, VX3, VXS, VT0 + XVMUL VT1, VX1, VXS + XVMSUB VT1, VX3, VXC, VT1 + xvst VT0, X, 4 * SIZE + xvst VT1, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE +#endif + XVMUL VT0, VX0, VXC + XVMUL VT1, VX2, VXC + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE +#ifdef DOUBLE + XVMUL VT0, VX1, VXC + XVMUL VT1, VX3, VXC + xvst VT0, X, 4 * SIZE + xvst VT1, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE +#endif + XVMUL VT0, VX2, VXS + XVMUL VT1, VX0, VXS + XVFSUB VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE +#ifdef DOUBLE + XVMUL VT0, VX3, VXS + XVMUL VT1, VX1, VXS + xvfsub.d VT1, VXZ, VT1 + xvst VT0, X, 4 * SIZE + xvst VT1, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + xvst VXZ, X, 0 * SIZE + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 4 * SIZE + xvst VXZ, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L120 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L122 // C!=0 S==0 + b .L121 // C!=0 S!=0 + .align 3 + +.L120: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L124 // C==0 S==0 + b 
.L123 // C==0 S!=0 + .align 3 + +.L121: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY +#endif + XVMUL VT0, VX0, VXC + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VX0, VXS + XVMSUB VT1, VX2, VXC, VT1 + +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvst VT0, X, 0 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + XVMUL VT0, VX1, VXC + XVFMADD VT0, VX3, VXS, VT0 + XVMUL VT1, VX1, VXS + XVMSUB VT1, VX3, VXC, VT1 + xvst VT0, X, 4 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 +#else + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // C!=0 S==0 +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX0, VXC + xvfmul.d VT1, VX2, VXC + xvld VX1, X, 4 * SIZE + xvst VT0, X, 0 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX1, VXC + xvfmul.d VT1, VX3, VXC + addi.d I, I, -1 + xvst VT0, X, 4 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d 
Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmul.s VT1, VX2, VXC + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // C==0 S!=0 +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX2, VXS + xvfmul.d VT1, VX0, VXS + xvfsub.d VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX3, VXS + xvfmul.d VT1, VX1, VXS + xvfsub.d VT1, VXZ, VT1 + addi.d I, I, -1 + xvst VT0, X, 4 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX2, VXS + xvfmul.s VT1, VX0, VXS + xvfsub.s VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // C==0 S==0 + xvst VXZ, X, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 0 * SIZE + xvst VXZ, X, 4 * SIZE + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + 
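+ // (remaining zeroed Y lanes are stored one element at a time, since INCY may be non-unit)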
xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 +#else + xvst VXZ, X, 0 * SIZE + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L124 + move Y, YY + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L210 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L212 // C!=0 S==0 + b .L211 // C!=0 S!=0 + .align 3 + +.L210: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L214 // C==0 S==0 + b .L213 // C==0 S!=0 + .align 3 + +.L211: // C!=0 S!=0 +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VXC, VX0 + xvfmadd.d VT0, VX2, VXS, VT0 + xvfmul.d VT1, VXS, VX0 + xvfmsub.d VT1, VX2, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VX1, VXC + xvfmadd.d VT0, VX3, VXS, VT0 + xvfmul.d VT1, VX1, VXS + xvfmsub.d VT1, VX3, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXC, VX0 + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b 
.L997 + .align 3 + +.L212: // C!=0 S==0 +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VXC, VX0 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX2, VXC + xvst VT1, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VX1, VXC + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX3, VXC // y' = c*y for the second block as well (s == 0 on this path) + xvst VT1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXC, VX0 + xvfmul.s VT1, VX2, VXC + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // C==0 S!=0 +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VXS, VX2 + xvfmul.d VT1, VXS, VX0 + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VX3, VXS + xvfmul.d VT1, VX1, VXS + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX
 + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXS, VX2 + xvfmul.s VT1, VXS, VX0 + xvfsub.s VT1, VXZ, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // C==0 S==0 +#ifdef DOUBLE + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvst VXZ, Y, 0 * SIZE // both vectors become zero on this path + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvst VXZ, Y, 4 * SIZE +#else + xvstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvst VXZ, Y, 0 * SIZE + xvstelm.w VXZ, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 7 + add.d XX, XX, INCX +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX0, VXC + xvfmadd.d VT0, VX2, VXS, VT0 + xvfmul.d VT1, VX0, VXS + xvfmsub.d VT1, VX2, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX
 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX1, VXC + xvfmadd.d VT0, VX3, VXS, VT0 + xvfmul.d VT1, VX1, VXS // s*x for the second block (VX1/VX3) + xvfmsub.d VT1, VX3, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY +#endif + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX0, VXC + xvfmul.d VT1, VX2, VXC + xvstelm.d VT0, XX, 0, 
0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX1, VXC + xvfmul.d VT1, VX3, VXC + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmul.s VT1, VX2, VXC + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY +#endif + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + 
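+ // finish gathering the strided X elements into VX0 lanes 2-3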
xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX2, VXS + xvfmul.d VT1, VX0, VXS + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX3, VXS + xvfmul.d VT1, VX1, VXS // s*x for the second block (VX1) + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX2, VXS + xvfmul.s VT1, VX0, VXS + xvfsub.s VT1, VXZ, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + 
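+ // scatter the remaining rotated Y lanes back through YY with stride INCY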
xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY +#endif + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 +#ifdef DOUBLE + move X, XX + move Y, YY +#endif + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE + MUL $f10, $f12, C + MADD $f10, $f13, S, $f10 + ST $f10, X, 0 * SIZE + addi.d I, I, -1 + MUL $f20, $f12, S + MSUB $f20, $f13, C, $f20 + ST $f20, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/rot_lsx.S b/kernel/loongarch64/rot_lsx.S new file mode 100644 index 000000000..4b0e59310 --- /dev/null +++ b/kernel/loongarch64/rot_lsx.S @@ -0,0 +1,1791 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VT0 $vr10 +#define VT1 $vr18 +#define VXC $vr23 +#define VXS $vr9 +#define VXZ $vr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT +#ifdef DOUBLE + movfr2gr.d t1, C + vreplgr2vr.d VXC, t1 + movfr2gr.d t2, S + vreplgr2vr.d VXS, t2 + movfr2gr.d t3, a1 + vreplgr2vr.d VXZ, t3 +#else + movfr2gr.s t1, C + vreplgr2vr.w VXC, t1 + movfr2gr.s t2, S + vreplgr2vr.w VXS, t2 + movfr2gr.s t3, a1 + vreplgr2vr.w VXZ, t3 +#endif + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE +#endif + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 +#ifdef DOUBLE + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 +#endif + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE +#ifdef DOUBLE + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE +#endif + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC +#ifdef DOUBLE + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC +#endif + vst VT0, X, 
4 * SIZE + vst VT1, Y, 4 * SIZE +#ifdef DOUBLE + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE +#endif + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 +#ifdef DOUBLE + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 +#endif + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE +#ifdef DOUBLE + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, Y, 2 * SIZE +#endif + vst VXZ, X, 4 * SIZE + vst VXZ, Y, 4 * SIZE +#ifdef DOUBLE + vst VXZ, X, 6 * SIZE + vst VXZ, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L120 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L122 // C!=0 S==0 + b .L121 // C!=0 S!=0 + .align 3 + +.L120: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L124 // C==0 S==0 + b .L123 // C==0 S!=0 + .align 3 + +.L121: // C!=0 S!=0 +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE +#endif + add.d Y, Y, INCY +#ifndef DOUBLE + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY +#endif + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 +#endif + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD 
VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vst VT0, X, 4 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // C!=0 S==0 +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 +#else + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 +#endif + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vst VT0, X, 0 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 4 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 4 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // C==0 S!=0 +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * 
SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 +#else + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 +#endif + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 0 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, X, 4 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, X, 6 * SIZE + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L124 + move Y, YY + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L210 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L212 // C!=0 S==0 + b .L211 // C!=0 S!=0 + .align 3 + +.L210: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L214 // C==0 S==0 + b .L213 // C==0 S!=0 + .align 3 + +.L211: // C!=0 S!=0 +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE +
add.d X, X, INCX + ld.d t2, X, 0 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 +#endif + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VXS, VX0 + VMSUB VT1, VX2, VXC, VT1 +#ifdef DOUBLE + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VXS, VX0 + VMSUB VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 6 * SIZE +#else + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // C!=0 S==0 +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 +#endif + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VMUL VT1, VX2, VXC + +#ifdef DOUBLE + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VMUL VT1, VX2, VXC + 
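+ // S == 0 specialization: the rotation collapses to plain scaling, x' = c*x
+ // and y' = c*y, so each batch needs only two VMULs and no VFMADD/VMSUB pair.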
vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + vst VT1, Y, 6 * SIZE +#else + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // C==0 S!=0 +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 +#endif + add.d X, X, INCX + VMUL VT0, VXS, VX2 + VMUL VT1, VXS, VX0 + VFSUB VT1, VXZ, VT1 + +#ifdef DOUBLE + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + VMUL VT0, VXS, VX2 + VMUL VT1, VXS, VX0 + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 6 * SIZE +#else + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X,
INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 2 * SIZE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + vst VXZ, Y, 6 * SIZE +#else + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vst VXZ, Y, 0 * SIZE + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 +#endif + add.d XX, XX, INCX + vst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + move X, XX + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 +
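+ // Each batch pairs one VFMADD (VT0 = c*x + s*y) with one VMSUB (VT1 = c*y - s*x),
+ // the vector counterpart of the scalar MADD/MSUB tail in .L998.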
vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + VMUL
VT0, VX0, VXC + VMUL VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + 
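+ // C == 0 specialization: x' = s*y and y' = -(s*x); the sign flip is formed
+ // by subtracting s*x from the all-zero vector VXZ.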
vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 +
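+ // The ladder .L221-.L224 specializes the rotation: general (c and s nonzero),
+ // s == 0 (pure scaling by c), c == 0 (the scaled swap just closed above), and
+ // both zero (.L224 below simply stores zeros to both vectors).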
+.L224: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE + MUL $f10, $f12, C + MADD $f10, $f13, S, $f10 + ST $f10, X, 0 * SIZE + addi.d I, I, -1 + MUL $f20, $f12, S + MSUB $f20, $f13, C, $f20 + ST $f20, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/scal_lasx.S b/kernel/loongarch64/scal_lasx.S new file mode 100644 index 000000000..48e2c0718 --- /dev/null +++ b/kernel/loongarch64/scal_lasx.S @@ -0,0 +1,282 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
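With both rot kernels complete, one shared structural point is worth noting: every vector loop in this patch consumes N >> 3 batches of eight elements and then falls through to a scalar tail (.L997/.L998 above) for the N & 7 leftovers. A small sketch of that split, with hypothetical helper names:

/* Batch/tail split shared by the BLAS-1 kernels in this patch. */
static void split8(long n, long *batches, long *tail)
{
    *batches = n >> 3;   /* srai.d I, N, 3: iterations of the vector loop */
    *tail    = n & 7;    /* andi  I, N, 7: elements left for the scalar tail */
}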
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VT0 $xr14 +#define VT1 $xr15 +#define VALPHA $xr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + movgr2fr.d a2, TEMP + FFINT a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + CMPEQ $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + MTG TEMP, ALPHA +#ifdef DOUBLE + xvreplgr2vr.d VALPHA, TEMP +#else + xvreplgr2vr.w VALPHA, TEMP +#endif + move XX, X + .align 3 + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 +.L11: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX1, VALPHA + xvstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VT0, VX0, VALPHA + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 +#endif + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + 
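The strided .L11 loop above gathers eight elements with xvinsgr2vr, scales them with a single vector multiply, and scatters the lanes back with xvstelm at stride INCX. Elementwise it reduces to the sketch below, with FLOAT again standing in for the build's element type; note how the shortcuts taken in the PROLOGUE and in the .L20 path show up:

typedef double FLOAT;  /* float in the single-precision build */

/* Scalar model of this SCAL kernel. The alpha == 0 path stores zeros
   without reading x, so NaNs in x are not propagated; this mirrors the
   kernel's .L20 branch rather than a literal IEEE multiply. */
static void scal_sketch(long n, FLOAT alpha, FLOAT *x, long incx)
{
    if (n <= 0 || incx <= 0) return;    /* bge $r0, N / INCX, .L999 */
    if (alpha == (FLOAT)1.0) return;    /* CMPEQ + bcnez shortcut   */
    for (long i = 0; i < n; i++)
        x[i * incx] = (alpha == (FLOAT)0.0) ? (FLOAT)0.0
                                            : alpha * x[i * incx];
}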
+.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L23: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 +.L25: + xvxor.v VX0, VX0, VX0 + xvst VX0, X, 0 * SIZE +#ifdef DOUBLE + xvst VX0, X, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L27: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + MTG TEMP, ALPHA +#ifdef DOUBLE + xvreplgr2vr.d VALPHA , TEMP +#else + xvreplgr2vr.w VALPHA , TEMP +#endif + .align 3 + +.L31: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvfmul.d VT0, VX0, VALPHA + xvfmul.d VT1, VX1, VALPHA + xvst VT0, X, 0 * SIZE + xvst VT1, X, 4 * SIZE +#else + xvfmul.s VT0, VX0, VALPHA + xvst VT0, X, 0 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L33: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + MUL a1, ALPHA, a1 + ST a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/scal_lsx.S b/kernel/loongarch64/scal_lsx.S new file mode 100644 index 000000000..1ffce7db2 --- /dev/null +++ b/kernel/loongarch64/scal_lsx.S @@ -0,0 +1,301 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VT0 $vr14 +#define VT1 $vr15 +#define VALPHA $vr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + movgr2fr.d a2, TEMP + FFINT a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + CMPEQ $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + MTG TEMP, ALPHA +#ifdef DOUBLE + vreplgr2vr.d VALPHA, TEMP +#else + vreplgr2vr.w VALPHA, TEMP +#endif + move XX, X + .align 3 + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 + +.L11: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + vfmul.s VT0, VX0, VALPHA + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vfmul.s VT1, VX1, VALPHA + vstelm.w VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 3 +#endif + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L23: + ST a1, X, 0 * 
SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 + +.L25: + vxor.v VX0, VX0, VX0 + vst VX0, X, 0 * SIZE +#ifdef DOUBLE + vst VX0, X, 2 * SIZE + vst VX0, X, 4 * SIZE + vst VX0, X, 6 * SIZE +#else + vst VX0, X, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L27: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + MTG TEMP, ALPHA +#ifdef DOUBLE + vreplgr2vr.d VALPHA , TEMP +#else + vreplgr2vr.w VALPHA , TEMP +#endif + .align 3 + +.L31: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vfmul.d VT1, VX1, VALPHA + vld VX0, X, 4 * SIZE + vst VT0, X, 0 * SIZE + vst VT1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vld VX1, X, 6 * SIZE + vst VT0, X, 4 * SIZE + vfmul.d VT1, VX1, VALPHA + vst VT1, X, 6 * SIZE + addi.d I, I, -1 +#else + vld VX1, X, 4 * SIZE + vfmul.s VT0, VX0, VALPHA + vfmul.s VT1, VX1, VALPHA + addi.d I, I, -1 + vst VT0, X, 0 * SIZE + vst VT1, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L33: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + MUL a1, ALPHA, a1 + ST a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/snrm2_lasx.S b/kernel/loongarch64/snrm2_lasx.S new file mode 100644 index 000000000..3ae11e897 --- /dev/null +++ b/kernel/loongarch64/snrm2_lasx.S @@ -0,0 +1,153 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 + +/* Don't change following FR unless you know the effects. */ +#define VX0 $xr15 +#define VX1 $xr16 +#define VX2 $xr17 +#define VX3 $xr18 +#define VX4 $xr21 +#define res1 $xr19 +#define res2 $xr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + xvld VX0, X, 0 + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + .align 3 + b .L996 + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + fadd.d $f19, $f19, $f16 + fadd.d $f19, $f19, $f17 + fadd.d $f19, $f19, $f18 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s $f15, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + fcvt.d.s $f15, $f15 + fmadd.d $f19, $f15, $f15, $f19 + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + move $r4, $r17 + fcvt.s.d $f0, $f19 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/snrm2_lsx.S b/kernel/loongarch64/snrm2_lsx.S new file mode 100644 index 000000000..bb492dbf0 --- /dev/null +++ b/kernel/loongarch64/snrm2_lsx.S @@ -0,0 +1,159 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
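snrm2_lsx.S, whose header begins here, mirrors the LASX kernel just shown, and both rest on one idea: each float input is widened to double (xvfcvtl.d.s for the low lanes, xvfcvth.d.s for the high ones), the squares are accumulated in double, and a single square root finishes the reduction. No scaling pass is needed because the square of any finite float fits comfortably in a double. The final .L996 blocks then fold the accumulator lanes into one value (xvpickve.d plus scalar fadd.d in the LASX file, one vreplvei.d/vfadd.d step in the LSX file). A scalar sketch of the strategy:

#include <math.h>

/* Widen-then-accumulate model of the snrm2 kernels: double accumulation
   makes overflow of squared float inputs impossible. */
static float snrm2_sketch(long n, const float *x, long incx)
{
    double acc = 0.0;
    for (long i = 0; i < n; i++) {
        double v = (double)x[i * incx];  /* fcvt.d.s / xvfcvt[lh].d.s */
        acc += v * v;                    /* fmadd.d / xvfmadd.d       */
    }
    return (float)sqrt(acc);             /* fsqrt.d then fcvt.s.d     */
}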
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define VX0 $vr15 +#define VX1 $vr16 +#define VX2 $vr17 +#define VX3 $vr18 +#define VX4 $vr21 +#define VX5 $vr22 +/* Don't change following FR unless you know the effects. */ +#define res1 $vr19 +#define res2 $vr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + vld VX0, X, 0 + vld VX5, X, 4 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfcvtl.d.s VX3, VX5 + vfcvth.d.s VX4, VX5 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + blt $r0, I, .L10 + b .L996 + .align 3 + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s $f15, X, 0 + addi.d I, I, -1 + fcvt.d.s $f15, $f15 + fmadd.d $f19, $f15, $f15, $f19 + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + move $r4, $r17 + fcvt.s.d $f0, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/sum_lasx.S b/kernel/loongarch64/sum_lasx.S new file mode 100644 index 000000000..fd6d5adb3 --- /dev/null +++ b/kernel/loongarch64/sum_lasx.S @@ -0,0 +1,225 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r17
+#define TEMP $r18
+#define t1 $r15
+#define t2 $r12
+#define t3 $r13
+#define t4 $r14
+#define VX0 $xr12
+#define VX1 $xr13
+#define VX2 $xr14
+#define VX3 $xr15
+#define res1 $xr16
+#define res2 $xr17
+    PROLOGUE
+    xvxor.v res1, res1, res1
+    xvxor.v res2, res2, res2
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, SIZE
+    slli.d INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L13
+    .align 3
+
+.L11:
+    xvld VX0, X, 0
+#ifdef DOUBLE
+    xvld VX1, X, 32
+    xvfadd.d res1, res1, VX0
+    xvfadd.d res1, res1, VX1
+#else
+    xvfadd.s res1, res1, VX0
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvfadd.s res2, res1, res2
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    LD $f12, X, 0
+    ADD $f16, $f12, $f16
+    addi.d I, I, -1
+    addi.d X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvfadd.s res1, VX0, res1
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvfadd.s res2, res1, res2
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD $f12, X, 0
+    ADD $f16, $f12, $f16
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/sum_lsx.S b/kernel/loongarch64/sum_lsx.S
new file mode 100644
index 000000000..6b2027781
--- /dev/null
+++ b/kernel/loongarch64/sum_lsx.S
@@ -0,0 +1,204 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + vld VX0, X, 0 + vld VX1, X, 16 + VFADD res2, VX0, VX1 + VFADD res1, res1, res2 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + VFADD res2, VX0, VX1 + VFADD res1, res1, res2 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, res1, VX1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/swap_lasx.S b/kernel/loongarch64/swap_lasx.S new file mode 100644 index 000000000..4767fffe3 --- /dev/null +++ b/kernel/loongarch64/swap_lasx.S @@ -0,0 +1,401 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 + xvld VX2, Y, 0 + addi.d I, I, -1 + xvst VX2, X, 0 + xvst VX0, Y, 0 +#ifdef DOUBLE + xvld VX0, X, 32 + xvld VX2, Y, 32 + xvst VX2, X, 32 + xvst VX0, Y, 32 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 + ld.d t1, Y, 0 + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.d t4, Y, 0 + xvstelm.d VX0, Y, 0, 3 + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvst VX2, X, 0 + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 + 
xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + ld.d t4, Y, 0 + xvstelm.d VX1, Y, 0, 3 + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX0, X, 0 + ld.w t1, Y, 0 + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + xvstelm.w VX0, Y, 0, 3 + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + ld.w t2, Y, 0 + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + ld.w t3, Y, 0 + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + ld.w t4, Y, 0 + xvstelm.w VX0, Y, 0, 7 + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvst VX2, X, 0 + addi.d X, X, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 + ld.d t1, X, 0 + xvstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + xvstelm.d VX2, X, 0, 1 + add.d X, X, INCX + ld.d t3, X, 0 + xvstelm.d VX2, X, 0, 2 + add.d X, X, INCX + ld.d t4, X, 0 + xvstelm.d VX2, X, 0, 3 + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvst VX0, Y, 0 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 + xvstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + xvstelm.d VX3, X, 0, 1 + add.d X, X, INCX + ld.d t3, X, 0 + xvstelm.d VX3, X, 0, 2 + add.d X, X, INCX + ld.d t4, X, 0 + xvstelm.d VX3, X, 0, 3 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX2, Y, 0 + ld.w t1, X, 0 + xvstelm.w VX2, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 + xvstelm.w VX2, X, 0, 1 + add.d X, X, INCX + ld.w t3, X, 0 + xvstelm.w VX2, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 + xvstelm.w VX2, X, 0, 3 + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 + xvstelm.w VX2, X, 0, 4 + add.d X, X, INCX + ld.w t2, X, 0 + xvstelm.w VX2, X, 0, 5 + add.d X, X, INCX + ld.w t3, X, 0 + xvstelm.w VX2, X, 0, 6 + add.d X, X, INCX + ld.w t4, X, 0 + xvstelm.w VX2, X, 0, 7 + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvst VX0, Y, 0 + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD b3, Y, 0 + ST a3, Y, 0 + add.d Y, Y, INCY + LD b4, Y, 0 + ST a4, Y, 0 + 
add.d Y, Y, INCY
+    LD a1, X, 0
+    add.d X, X, INCX
+    ST b1, XX, 0
+    add.d XX, XX, INCX
+    LD b1, Y, 0
+    ST a1, Y, 0
+    add.d Y, Y, INCY
+    LD a2, X, 0
+    add.d X, X, INCX
+    ST b2, XX, 0
+    add.d XX, XX, INCX
+    LD b2, Y, 0
+    ST a2, Y, 0
+    add.d Y, Y, INCY
+    LD a3, X, 0
+    add.d X, X, INCX
+    ST b3, XX, 0
+    add.d XX, XX, INCX
+    LD b3, Y, 0
+    ST a3, Y, 0
+    add.d Y, Y, INCY
+    LD a4, X, 0
+    add.d X, X, INCX
+    ST b4, XX, 0
+    add.d XX, XX, INCX
+    LD b4, Y, 0
+    ST a4, Y, 0
+    add.d Y, Y, INCY
+    ST b1, XX, 0
+    add.d XX, XX, INCX
+    ST b2, XX, 0
+    add.d XX, XX, INCX
+    ST b3, XX, 0
+    add.d XX, XX, INCX
+    ST b4, XX, 0
+    add.d XX, XX, INCX
+    addi.d I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    LD $f12, X, 0
+    LD $f14, Y, 0
+    addi.d I, I, -1
+    ST $f12, Y, 0
+    ST $f14, X, 0
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/swap_lsx.S b/kernel/loongarch64/swap_lsx.S
new file mode 100644
index 000000000..736187f93
--- /dev/null
+++ b/kernel/loongarch64/swap_lsx.S
@@ -0,0 +1,431 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and incy==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, Y, 0 + vld VX3, Y, 16 + addi.d I, I, -1 + vst VX2, X, 0 + vst VX3, X, 16 + vst VX0, Y, 0 + vst VX1, Y, 16 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + vld VX2, Y, 32 + vld VX3, Y, 48 + vst VX2, X, 32 + vst VX3, X, 48 + vst VX0, Y, 32 + vst VX1, Y, 48 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: +#ifdef DOUBLE + fld.d $f12, X, 0 + fld.d $f14, Y, 0 + addi.d I, I, -1 + fst.d $f12, Y, 0 + fst.d $f14, X, 0 +#else + fld.s $f12, X, 0 + fld.s $f14, Y, 0 + addi.d I, I, -1 + fst.s $f12, Y, 0 + fst.s $f14, X, 0 +#endif + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 + ld.d t1, Y, 0 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + vstelm.d VX0, Y, 0, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 0 + vld VX1, X, 2 * SIZE + ld.d t3, Y, 0 + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t4, Y, 0 + vstelm.d VX1, Y, 0, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + ld.d t1, Y, 0 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + vstelm.d VX0, Y, 0, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t4, Y, 0 + vstelm.d VX1, Y, 0, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 6 * SIZE + addi.d X, X, 8 * SIZE +#else + vld VX0, X, 0 + ld.w t1, Y, 0 + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + vstelm.w VX0, Y, 0, 3 + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vst VX2, X, 0 + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + vstelm.w VX1, Y, 0, 3 + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 
7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    LD $f12, X, 0
+    LD $f14, Y, 0
+    addi.d I, I, -1
+    ST $f12, Y, 0
+    ST $f14, X, 0
+    addi.d X, X, SIZE
+    add.d Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+/* INCX!=1 and INCY==1 */
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    vld VX2, Y, 0
+    ld.d t1, X, 0
+    vstelm.d VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    vstelm.d VX2, X, 0, 1
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vst VX0, Y, 0
+    vld VX3, Y, 2 * SIZE
+    ld.d t3, X, 0
+    vstelm.d VX3, X, 0, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    vstelm.d VX3, X, 0, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vst VX1, Y, 2 * SIZE
+    vld VX2, Y, 4 * SIZE
+    ld.d t1, X, 0
+    vstelm.d VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    vstelm.d VX2, X, 0, 1
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vst VX0, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    ld.d t3, X, 0
+    vstelm.d VX3, X, 0, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    vstelm.d VX3, X, 0, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vst VX1, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    vld VX2, Y, 0
+    ld.w t1, X, 0
+    vstelm.w VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    vstelm.w VX2, X, 0, 1
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    vstelm.w VX2, X, 0, 2
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    vstelm.w VX2, X, 0, 3
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vst VX0, Y, 0
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0
+    vstelm.w VX3, X, 0, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    vstelm.w VX3, X, 0, 1
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    vstelm.w VX3, X, 0, 2
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    vstelm.w VX3, X, 0, 3
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    vst VX1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    LD $f12, X, 0 * SIZE
+    LD $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    ST $f12, Y, 0 * SIZE
+    ST $f14, X, 0 * SIZE
+    add.d X, X, INCX
+    addi.d Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+/* INCX!=1 and INCY!=1 */
+.L22:
+    bge $r0, I, .L223
+    .align 3
+    move XX, X
+
+.L222:
+    LD a1, X, 0
+    add.d X, X, INCX
+    LD a2, X, 0
+    add.d X, X, INCX
+    LD a3, X, 0
+    add.d X, X, INCX
+    LD a4, X, 0
+    add.d X, X, INCX
+    LD b1, Y, 0
+    ST a1, Y, 0
+    add.d Y, Y, INCY
+    LD b2, Y, 0
+    ST a2, Y, 0
+    add.d Y, Y, INCY
+    LD b3, Y, 0
+    ST a3, Y, 0
+    add.d Y, Y, INCY
+    LD b4, Y, 0
+    ST a4, Y, 0
+    add.d Y, Y, INCY
+    LD a1, X, 0
+    add.d X, X, INCX
+    ST b1, XX, 0
+    add.d XX, XX, INCX
+    LD b1, Y, 0
+    ST a1, Y, 0
+    add.d Y, Y, INCY
+    LD a2, X, 0
+    add.d X, X, INCX
+    ST b2, XX, 0
+    add.d XX, XX, INCX
+    LD b2, Y, 0
+    ST a2, Y, 0
+    add.d Y, Y, INCY
+    LD a3, X, 0
+    add.d X, X, INCX
+    ST b3, XX, 0
+    add.d XX, XX, INCX
+    LD b3, Y, 0
+    ST a3, Y, 0
+    add.d Y, Y, INCY
+    LD a4, X, 0
+    add.d X, X, INCX
+    ST b4, XX, 0
+    add.d XX, XX, INCX
+    LD b4, Y, 0
+    ST a4, Y, 0
+    add.d Y, Y, INCY
+    ST b1, XX, 0
+    add.d XX, XX, INCX
+    ST b2, XX, 0
+    add.d XX, XX, INCX
+    ST b3, XX, 0
+    add.d XX, XX, INCX
+    ST b4, XX, 0
+    add.d XX, XX, INCX
+    addi.d I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    LD $f12, X, 0
+    LD $f14, Y, 0
+    addi.d I, I, -1
+    ST $f12, Y, 0
+    ST $f14, X, 0
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
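+
+/* For reference only: a minimal C sketch of the semantics every path above
+ * implements (not part of the build; the function name is hypothetical, and
+ * FLOAT stands for float or double depending on whether DOUBLE is defined):
+ *
+ *     static void swap_ref(BLASLONG n, FLOAT *x, BLASLONG incx,
+ *                          FLOAT *y, BLASLONG incy)
+ *     {
+ *         BLASLONG i;
+ *         for (i = 0; i < n; i++) {
+ *             FLOAT tmp = *x;   // exchange one element pair
+ *             *x = *y;
+ *             *y = tmp;
+ *             x += incx;        // step both vectors by their strides
+ *             y += incy;
+ *         }
+ *     }
+ *
+ * The .L222 loop buffers X through a1-a4/b1-b4 and writes back via XX, a
+ * lagged copy of X, so loads can run ahead of stores within each iteration.
+ */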
+ + EPILOGUE diff --git a/kernel/loongarch64/zgemm_kernel_2x2.S b/kernel/loongarch64/zgemm_kernel_2x2.S new file mode 100644 index 000000000..589d170c5 --- /dev/null +++ b/kernel/loongarch64/zgemm_kernel_2x2.S @@ -0,0 +1,848 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 +#define c51 $f26 +#define c52 $f27 +#define c61 $f28 +#define c62 $f29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -88 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + ST $f23, $sp, 32 + ST $f24, $sp, 40 + ST $f25, $sp, 48 + ST $f26, $sp, 56 + ST $f27, $sp, 64 + ST $f28, $sp, 72 + ST $f29, $sp, 80 + + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, BASE_SHIFT + + move J, $r0 + srai.d T0, N, 1 + beq J, T0, .L19 + +.L10: /* for(j=0; j 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + + xvpermi.q D0, U2, 0x02 + xvpermi.q D2, U0, 0x31 + xvpermi.q D1, U3, 0x02 + xvpermi.q D3, U1, 0x31 + + xvst D0, TD, 0x00 + xvst D2, TD, 0x20 + xvst D1, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x40 // a_offset1 + addi.d S2, S2, 0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + xvld U0, TS, 0x00 + xvld U1, TS, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d TS, 
TS, 0x40 // a_offset + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + vld $vr0, TS, 0x00 + vst $vr0, TD, 0x00 + + + addi.d TS, TS, 0x10 // a_offset + addi.d TD, TD, 0x10 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_2_lasx.S b/kernel/loongarch64/zgemm_tcopy_2_lasx.S new file mode 100644 index 000000000..3fe17beef --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_2_lasx.S @@ -0,0 +1,212 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, S8, 0x00 + xvst U2, S8, 0x20 + + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + xvst U1, S8, 0x00 + xvst U3, S8, 0x20 + + add.d S8, S8, T0 + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S9, 0x00 + vst $vr1, S9, 0x10 + + addi.d S9, S9, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + xvld U0, TS, 0x00 + xvld U1, TS, 0x20 + + xvst U0, TD, 0x00 + + slli.d T0, M, 0x05 + add.d TD, TD, T0 + + xvst U1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + xvld U0, TS, 0x00 + + xvst U0, TD, 0x00 + + addi.d TS, TS, 0x20 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld $vr0, TS, 0x00 + + vst $vr0, S9, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/znrm2_lasx.S b/kernel/loongarch64/znrm2_lasx.S new file mode 100644 index 000000000..53f8a6e05 --- /dev/null +++ b/kernel/loongarch64/znrm2_lasx.S @@ -0,0 +1,252 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 +#define a3 $f15 +#define a2 $f16 +#define VX0 $xr15 +#define VX1 $xr16 +#define VM0 $xr17 +#define VM1 $xr18 +#define VM2 $xr13 +#define VM3 $xr14 +#define res1 $xr19 +#define res2 $xr20 +#define VALPHA $xr21 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L97 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmaxa.d VM1, VX1, VX0 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + xvpickve.d VX0, VM0, 1 + xvpickve.d VX1, VM0, 2 + xvpickve.d VM3, VM0, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM2, VM3, VM0 + xvfmaxa.d VM0, VM1, VM2 + .align 3 + +.L97: + andi I, N, 3 + bge $r0, I, .L99 + .align 3 + +.L98: + fld.d a3, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fmaxa.d a3, a2, a3 + fmaxa.d max, a3, max + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + 
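+    // First-pass reduction is done: max holds max_i(|Re(x_i)|, |Im(x_i)|),
+    // accumulated with fmaxa (maximum by magnitude). Compute ALPHA = 1/max,
+    // bailing out early: if max == 0 the norm is 0, and if 1/max rounds to
+    // +inf (max is subnormal) the scaled pass is skipped. The second pass
+    // (.L100 onward) accumulates sum((x*ALPHA)^2) so that .L999 can return
+    // max * sqrt(sum) without overflow in the intermediate squares.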
fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L120 + bge $r0, I, .L997 + .align 3 + +.L110: + xvld VX0, XX, 0 * SIZE + xvld VX1, XX, 4 * SIZE + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + bge $r0, I, .L997 + .align 3 + +.L121: + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.d VX0, res1, 1 + xvpickve.d VX1, res1, 2 + xvpickve.d VM2, res1, 3 + xvfadd.d res1, VX0, res1 + xvfadd.d VX1, VX1, VM2 + xvfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d a3, XX, 0 * SIZE + fld.d a2, XX, 1 * SIZE + addi.d I, I, -1 + fmul.d a3, a3, ALPHA + fmadd.d $f19, a3, a3, $f19 + fmul.d a2, a2, ALPHA + fmadd.d $f19, a2, a2, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/znrm2_lsx.S b/kernel/loongarch64/znrm2_lsx.S new file mode 100644 index 000000000..14c59d504 --- /dev/null +++ b/kernel/loongarch64/znrm2_lsx.S @@ -0,0 +1,260 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 +#define a3 $f15 +#define a2 $f16 +#define VX0 $vr15 +#define VX1 $vr16 +#define VM0 $vr17 +#define VM1 $vr18 +#define VM2 $vr13 +#define VM3 $vr14 +#define res1 $vr19 +#define res2 $vr20 +#define VALPHA $vr21 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L97 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmaxa.d VM1, VX1, VX0 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmaxa.d VM2, VX1, VX0 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + vfmaxa.d VM0, VX0, VX1 + .align 3 + +.L97: + andi I, N, 3 + bge $r0, I, .L99 + .align 3 + +.L98: + fld.d a3, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fmaxa.d a3, a2, a3 + fmaxa.d max, a3, max + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L120 + bge $r0, I, .L997 + .align 3 + +.L110: + vld VX0, XX, 0 * SIZE + vld VX1, XX, 2 * SIZE + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + vld VX0, XX, 4 * SIZE + vld VX1, XX, 6 * SIZE + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 
+ vfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + bge $r0, I, .L997 + .align 3 + +.L121: + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + ld.d t1, XX, 0 * SIZE + vfmul.d VM3, VX1, VALPHA + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d a3, XX, 0 * SIZE + fld.d a2, XX, 1 * SIZE + addi.d I, I, -1 + fmul.d a3, a3, ALPHA + fmadd.d $f19, a3, a3, $f19 + fmul.d a2, a2, ALPHA + fmadd.d $f19, a2, a2, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970 index a99fb7d96..fee5fa529 100644 --- a/kernel/power/KERNEL.PPC970 +++ b/kernel/power/KERNEL.PPC970 @@ -1,11 +1,11 @@ ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) SGEMMKERNEL = gemm_kernel.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) else diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4 index 1bdd3119e..c73601cee 100644 --- a/kernel/power/KERNEL.PPCG4 +++ b/kernel/power/KERNEL.PPCG4 @@ -96,9 +96,9 @@ CGEMMINCOPY = CGEMMONCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = +CGEMMINCOPYOBJ = #cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = +CGEMMITCOPYOBJ = #cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/power/sgemm_tcopy_16_power8.S b/kernel/power/sgemm_tcopy_16_power8.S index b9f6d63fb..6d2c6a555 100644 --- a/kernel/power/sgemm_tcopy_16_power8.S +++ b/kernel/power/sgemm_tcopy_16_power8.S @@ -108,6 +108,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define o0 0 +#ifdef POWER10 +#include "sgemm_tcopy_macros_16_power10.S" +#endif #include "sgemm_tcopy_macros_16_power8.S" #define STACKSIZE 144 diff --git a/kernel/power/sgemm_tcopy_macros_16_power10.S b/kernel/power/sgemm_tcopy_macros_16_power10.S new file mode 100644 index 000000000..dca37e48a --- /dev/null +++ b/kernel/power/sgemm_tcopy_macros_16_power10.S @@ -0,0 +1,323 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=16
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_4x16', `
+#else
+.macro COPY_4x16
+#endif
+
+    lxvpx    vs32, o0, A0
+    lxvpx    vs34, o32, A0
+
+    lxvpx    vs36, o0, A1
+    lxvpx    vs38, o32, A1
+
+    lxvpx    vs40, o0, A2
+    lxvpx    vs42, o32, A2
+
+    lxvpx    vs44, o0, A3
+    lxvpx    vs46, o32, A3
+
+    mr       T1, BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs32, o0, T1
+    stxvx    vs33, o16, T1
+    stxvx    vs34, o32, T1
+    stxvx    vs35, o48, T1
+#else
+    stxvx    vs33, o0, T1
+    stxvx    vs32, o16, T1
+    stxvx    vs35, o32, T1
+    stxvx    vs34, o48, T1
+#endif
+
+    addi     T1, T1, 64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs36, o0, T1
+    stxvx    vs37, o16, T1
+    stxvx    vs38, o32, T1
+    stxvx    vs39, o48, T1
+#else
+    stxvx    vs37, o0, T1
+    stxvx    vs36, o16, T1
+    stxvx    vs39, o32, T1
+    stxvx    vs38, o48, T1
+#endif
+
+    addi     T1, T1, 64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs40, o0, T1
+    stxvx    vs41, o16, T1
+    stxvx    vs42, o32, T1
+    stxvx    vs43, o48, T1
+#else
+    stxvx    vs41, o0, T1
+    stxvx    vs40, o16, T1
+    stxvx    vs43, o32, T1
+    stxvx    vs42, o48, T1
+#endif
+
+    addi     T1, T1, 64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs44, o0, T1
+    stxvx    vs45, o16, T1
+    stxvx    vs46, o32, T1
+    stxvx    vs47, o48, T1
+#else
+    stxvx    vs45, o0, T1
+    stxvx    vs44, o16, T1
+    stxvx    vs47, o32, T1
+    stxvx    vs46, o48, T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_4x8', `
+#else
+.macro COPY_4x8
+#endif
+
+    lxvpx    vs32, o0, A0
+
+    lxvpx    vs34, o0, A1
+
+    lxvpx    vs36, o0, A2
+
+    lxvpx    vs38, o0, A3
+
+    mr       T1, BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs32, o0, T1
+    stxvx    vs33, o16, T1
+
+    stxvx    vs34, o32, T1
+    stxvx    vs35, o48, T1
+#else
+    stxvx    vs33, o0, T1
+    stxvx    vs32, o16, T1
+
+    stxvx    vs35, o32, T1
+    stxvx    vs34, o48, T1
+#endif
+
+    addi     T1, T1, 64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs36, o0, T1
+    stxvx    vs37, o16, T1
+
+    stxvx    vs38, o32, T1
+    stxvx    vs39, o48, T1
+#else
+    stxvx    vs37, o0, T1
+    stxvx    vs36, o16, T1
+
+    stxvx    vs39, o32, T1
+    stxvx    vs38, o48, T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=2 and M=16
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_2x16', `
+#else
+.macro COPY_2x16
+#endif
+
+    lxvpx    vs32, o0, A0
+    lxvpx    vs34, o32, A0
+
+    lxvpx    vs36, o0, A1
+    lxvpx    vs38, o32, A1
+
+    mr       T1, BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs32, o0, T1
+    stxvx    vs33, o16, T1
+    stxvx    vs34, o32, T1
+    stxvx    vs35, o48, T1
+#else
+    stxvx    vs33, o0, T1
+    stxvx    vs32, o16, T1
+    stxvx    vs35, o32, T1
+    stxvx    vs34, o48, T1
+#endif
+
+    addi     T1, T1, 64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs36, o0, T1
+    stxvx    vs37, o16, T1
+    stxvx    vs38, o32, T1
+    stxvx    vs39, o48, T1
+#else
+    stxvx    vs37, o0, T1
+    stxvx    vs36, o16, T1
+    stxvx    vs39, o32, T1
+    stxvx    vs38, o48, T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_2x8', `
+#else
+.macro COPY_2x8
+#endif
+
+    lxvpx    vs32, o0, A0
+
+    lxvpx    vs34, o0, A1
+
+    mr       T1, BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs32, o0, T1
+    stxvx    vs33, o16, T1
+
+    stxvx    vs34, o32, T1
+    stxvx    vs35, o48, T1
+#else
+    stxvx    vs33, o0, T1
+    stxvx    vs32, o16, T1
+
+    stxvx    vs35, o32, T1
+    stxvx    vs34, o48, T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=1 and M=16
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_1x16', `
+#else
+.macro COPY_1x16
+#endif
+
+    lxvpx    vs32, o0, A0
+    lxvpx    vs34, o32, A0
+
+    mr       T1, BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs32, o0, T1
+    stxvx    vs33, o16, T1
+    stxvx    vs34, o32, T1
+    stxvx    vs35, o48, T1
+#else
+    stxvx    vs33, o0, T1
+    stxvx    vs32, o16, T1
+    stxvx    vs35, o32, T1
+    stxvx    vs34, o48, T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_1x8', `
+#else
+.macro COPY_1x8
+#endif
+
+    lxvpx    vs32, o0, A0
+
+    mr       T1, BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs32, o0, T1
+    stxvx    vs33, o16, T1
+#else
+    stxvx    vs33, o0, T1
+    stxvx    vs32, o16, T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
diff --git a/kernel/power/sgemm_tcopy_macros_16_power8.S b/kernel/power/sgemm_tcopy_macros_16_power8.S
index ed592a604..af237d5ee 100644
--- a/kernel/power/sgemm_tcopy_macros_16_power8.S
+++ b/kernel/power/sgemm_tcopy_macros_16_power8.S
@@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=4 and M=16
 **********************************************************************************************/
 
+#ifndef POWER10
 #if defined(_AIX)
 define(`COPY_4x16', `
 #else
@@ -141,6 +142,7 @@ define(`COPY_4x8', `
 #else
 .endm
 #endif
+#endif
 
 /**********************************************************************************************
 * Macros for N=4 and M=4
@@ -264,6 +266,7 @@ define(`COPY_4x1', `
 * Macros for N=2 and M=16
 **********************************************************************************************/
 
+#ifndef POWER10
 #if defined(_AIX)
 define(`COPY_2x16', `
 #else
@@ -329,6 +332,7 @@ define(`COPY_2x8', `
 #else
 .endm
 #endif
+#endif
 
 /**********************************************************************************************
 * Macros for N=2 and M=4
@@ -418,6 +422,7 @@ define(`COPY_2x1', `
 * Macros for N=1 and M=16
 **********************************************************************************************/
 
+#ifndef POWER10
 #if defined(_AIX)
 define(`COPY_1x16', `
 #else
@@ -465,6 +470,7 @@ define(`COPY_1x8', `
 #else
 .endm
 #endif
+#endif
 
 /**********************************************************************************************
 * Macros for N=1 and M=4
diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL
index bea7036c2..f8278c3b4 100644
--- a/kernel/x86_64/KERNEL
+++ b/kernel/x86_64/KERNEL
@@ -405,7 +405,7 @@ DGEMVNKERNEL = dgemv_n.S
 endif
 
 ifndef DGEMVTKERNEL
-DGEMVTKERNEL = dgemv_t.S
+DGEMVTKERNEL = dgemv_t_4.c
 endif
 
 ifndef CGEMVNKERNEL
diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c
index e4d054311..28d78d279 100644
--- a/kernel/x86_64/casum.c
+++ b/kernel/x86_64/casum.c
@@ -9,12 +9,12 @@
 #endif
 
 #ifndef HAVE_CASUM_KERNEL
-static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
+static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
 {
 
     BLASLONG i=0;
     BLASLONG n_8 = n & -8;
-    FLOAT *x = x1;
+    FLOAT *x1 = x;
     FLOAT temp0, temp1, temp2, temp3;
     FLOAT temp4, temp5, temp6, temp7;
     FLOAT sum0 = 0.0;
@@ -24,14 +24,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
     FLOAT sum4 = 0.0;
 
     while (i < n_8) {
-        temp0 = ABS_K(x[0]);
-        temp1 = ABS_K(x[1]);
-        temp2 = ABS_K(x[2]);
-        temp3 = ABS_K(x[3]);
-        temp4 = ABS_K(x[4]);
-        temp5 = ABS_K(x[5]);
-        temp6 = ABS_K(x[6]);
-        temp7 = ABS_K(x[7]);
+        temp0 = ABS_K(x1[0]);
+        temp1 = ABS_K(x1[1]);
+        temp2 = ABS_K(x1[2]);
+        temp3 = ABS_K(x1[3]);
+        temp4 = ABS_K(x1[4]);
+        temp5 = ABS_K(x1[5]);
+        temp6 = ABS_K(x1[6]);
+        temp7 = ABS_K(x1[7]);
 
         sum0 += temp0;
         sum1 += temp1;
@@ -43,12 +43,12 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
         sum2 += temp6;
         sum3 += temp7;
 
-        x+=8;
+        x1+=8;
         i+=4;
     }
 
     while (i < n) {
-        sum4 += (ABS_K(x1[0]) + ABS_K(x1[1]));
+        sum4 += ABS_K(x1[0]) + ABS_K(x1[1]);
         x1 += 2;
         i++;
     }
diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c
index d261962de..10b70ff20 100644
--- a/kernel/x86_64/casum_microk_skylakex-2.c
+++ b/kernel/x86_64/casum_microk_skylakex-2.c
@@ -2,9 +2,9 @@
 #ifdef __NVCOMPILER
 #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
 #endif
-#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && (__clang_major__ >= 9 &&__clang_major__ !=17)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
+#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
 
-#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309))
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
 
 #define HAVE_CASUM_KERNEL 1
 
@@ -20,15 +20,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
 
     if (n2 < 64) {
         __m128 accum_10, accum_11, accum_12, accum_13;
-        __m128 abs_mask1 = abs_mask1;
+        __m128 abs_mask1;
 
         accum_10 = _mm_setzero_ps();
         accum_11 = _mm_setzero_ps();
         accum_12 = _mm_setzero_ps();
         accum_13 = _mm_setzero_ps();
 
-        abs_mask1 = (__m128)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
-        abs_mask1 = (__m128)_mm_srli_epi32((__m128i) abs_mask1, 1);
+        abs_mask1 = (__m128)_mm_set1_epi32(0x7fffffff);
 
         _mm_prefetch(&x1[0], _MM_HINT_T0);
diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c
index dddf03fe2..f6bc8e37b 100644
--- a/kernel/x86_64/zasum_microk_skylakex-2.c
+++ b/kernel/x86_64/zasum_microk_skylakex-2.c
@@ -2,9 +2,9 @@
 #ifdef __NVCOMPILER
 #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
 #endif
-#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && ( __clang_major__ >= 9 && __clang_major__ != 17)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
+#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
 
-#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309))
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
 
 #define HAVE_ZASUM_KERNEL 1
 
@@ -21,16 +21,14 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
 
     if (n2 < 32) {
         __m128d accum_10, accum_11, accum_12, accum_13;
-        __m128d abs_mask1 = abs_mask1;
+        __m128d abs_mask1;
 
         accum_10 = _mm_setzero_pd();
         accum_11 = _mm_setzero_pd();
         accum_12 = _mm_setzero_pd();
         accum_13 = _mm_setzero_pd();
 
-        // abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);
-        abs_mask1 = (__m128d)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
-        abs_mask1 = (__m128d)_mm_srli_epi64((__m128i) abs_mask1, 1);
+        abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);
 
         _mm_prefetch(&x1[0], _MM_HINT_T0);
 
         if (n2 >= 16){
diff --git a/lapack-netlib/CMakeLists.txt b/lapack-netlib/CMakeLists.txt
index fefaa8b89..f1f47ae24 100644
--- a/lapack-netlib/CMakeLists.txt
+++ b/lapack-netlib/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.2)
+cmake_minimum_required(VERSION 3.6)
 
-project(LAPACK Fortran C)
+project(LAPACK)
 
 set(LAPACK_MAJOR_VERSION 3)
-set(LAPACK_MINOR_VERSION 11)
+set(LAPACK_MINOR_VERSION 12)
 set(LAPACK_PATCH_VERSION 0)
 
 set(
   LAPACK_VERSION
@@ -45,6 +45,14 @@ if(_is_coverage_build)
   find_package(codecov)
 endif()
 
+# Use valgrind if it is found
+option( LAPACK_TESTING_USE_PYTHON "Use Python for testing. Disable it on memory checks." ON )
+find_program( MEMORYCHECK_COMMAND valgrind )
+if( MEMORYCHECK_COMMAND )
+  message( STATUS "Found valgrind: ${MEMORYCHECK_COMMAND}" )
+  set( MEMORYCHECK_COMMAND_OPTIONS "--leak-check=full --show-leak-kinds=all --track-origins=yes" )
+endif()
+
 # By default test Fortran compiler complex abs and complex division
 option(TEST_FORTRAN_COMPILER "Test Fortran compiler complex abs and complex division" OFF)
 if( TEST_FORTRAN_COMPILER )
@@ -76,7 +84,7 @@ if( TEST_FORTRAN_COMPILER )
     WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL
     COMMENT "Running test_zminMax in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zminMax.err"
     SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zminMax.f )
-
+
 endif()
 
 # By default static library
@@ -99,6 +107,8 @@ else()
   set(LAPACKELIB "lapacke")
   set(TMGLIB "tmglib")
 endif()
+# By default build standard API and extended _64 API
+option(BUILD_INDEX64_EXT_API "Build Index-64 API as extended API with _64 suffix" ON)
 
 include(GNUInstallDirs)
 
@@ -127,90 +137,6 @@ configure_file(
 include(PreventInSourceBuilds)
 include(PreventInBuildInstalls)
 
-# Check if recursive flag exists
-include(CheckFortranCompilerFlag)
-if(CMAKE_Fortran_COMPILER_ID STREQUAL Flang)
-  check_fortran_compiler_flag("-Mrecursive" _MrecursiveFlag)
-elseif(CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
-  check_fortran_compiler_flag("-frecursive" _frecursiveFlag)
-elseif(CMAKE_Fortran_COMPILER_ID STREQUAL Intel)
-  check_fortran_compiler_flag("-recursive" _recursiveFlag)
-elseif(CMAKE_Fortran_COMPILER_ID STREQUAL XL)
-  check_fortran_compiler_flag("-qrecur" _qrecurFlag)
-elseif(CMAKE_Fortran_COMPILER_ID STREQUAL NAG)
-  check_fortran_compiler_flag("-recursive" _recursiveFlag)
-else()
-  message(WARNING "Fortran local arrays should be allocated on the stack."
-    " Please use a compiler which guarantees that feature."
-    " See https://github.com/Reference-LAPACK/lapack/pull/188 and references therein.")
-endif()
-
-# Add recursive flag
-if(_MrecursiveFlag)
-  string(REGEX MATCH "-Mrecursive" output_test "${CMAKE_Fortran_FLAGS}")
-  if(NOT output_test)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mrecursive"
-      CACHE STRING "Recursive flag must be set" FORCE)
-  endif()
-elseif(_frecursiveFlag)
-  string(REGEX MATCH "-frecursive" output_test "${CMAKE_Fortran_FLAGS}")
-  if(NOT output_test)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -frecursive"
-      CACHE STRING "Recursive flag must be set" FORCE)
-  endif()
-elseif(_recursiveFlag)
-  string(REGEX MATCH "-recursive" output_test "${CMAKE_Fortran_FLAGS}")
-  if(NOT output_test)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -recursive"
-      CACHE STRING "Recursive flag must be set" FORCE)
-  endif()
-elseif(_qrecurFlag)
-  string(REGEX MATCH "-qrecur" output_test "${CMAKE_Fortran_FLAGS}")
-  if(NOT output_test)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qrecur"
-      CACHE STRING "Recursive flag must be set" FORCE)
-  endif()
-endif()
-
-if(UNIX)
-  if(CMAKE_Fortran_COMPILER_ID STREQUAL Intel)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict")
-  endif()
-  if(CMAKE_Fortran_COMPILER_ID STREQUAL XL)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict")
-  endif()
-# Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler.
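A note on the ?ASUM microkernel hunks above: the old code built the sign-clearing constant by comparing `abs_mask1` against itself while it was still uninitialized (`__m128 abs_mask1 = abs_mask1;`). `cmpeq(x, x)` does yield all-ones bits for any non-trap value, but reading an indeterminate value is undefined behaviour in C, and an optimizer is entitled to discard the whole computation. The following standalone sketch -- an illustration only, not OpenBLAS source -- shows that the replacement `_mm_set1_epi32(0x7fffffff)` produces exactly the same mask and that ANDing it with a float vector implements fabsf():

    /* Illustration of the mask change in the casum/zasum hunks above.
     * Build with any SSE2-capable compiler, e.g. "cc -msse2 mask_demo.c". */
    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        /* Old idiom, but seeded from a well-defined value: an all-ones
         * register with each 32-bit lane shifted right by one, leaving
         * 0x7fffffff per lane.  The patched code seeded the cmpeq from an
         * uninitialized variable instead, which is the UB being fixed. */
        __m128i ones     = _mm_set1_epi32(-1);
        __m128i old_mask = _mm_srli_epi32(ones, 1);   /* 0x7fffffff lanes */

        /* New idiom from the patch: state the constant directly. */
        __m128i new_mask = _mm_set1_epi32(0x7fffffff);

        /* ANDing a float with 0x7fffffff clears its sign bit, i.e. fabsf(). */
        __m128 x  = _mm_set_ps(-4.0f, 3.0f, -2.0f, 1.0f);
        __m128 ax = _mm_and_ps(x, _mm_castsi128_ps(new_mask));

        float out[4];
        _mm_storeu_ps(out, ax);
        printf("masks identical: %d\n",
               _mm_movemask_epi8(_mm_cmpeq_epi32(old_mask, new_mask)) == 0xffff);
        printf("|x| = %g %g %g %g\n", out[0], out[1], out[2], out[3]);
        return 0;
    }

Both forms end up as a single register constant; the new one simply removes the read of an indeterminate value that a sufficiently aggressive compiler is allowed to exploit.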
-# This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin
-  string(REPLACE \;mtsk\; \; CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES "${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES}")
-endif()
-
-if(CMAKE_Fortran_COMPILER_ID STREQUAL Compaq)
-  if(WIN32)
-    if(CMAKE_GENERATOR STREQUAL "NMake Makefiles")
-      get_filename_component(CMAKE_Fortran_COMPILER_CMDNAM ${CMAKE_Fortran_COMPILER} NAME_WE)
-      message(STATUS "Using Compaq Fortran compiler with command name ${CMAKE_Fortran_COMPILER_CMDNAM}")
-      set(cmd ${CMAKE_Fortran_COMPILER_CMDNAM})
-      string(TOLOWER "${cmd}" cmdlc)
-      if(cmdlc STREQUAL "df")
-        message(STATUS "Assume the Compaq Visual Fortran Compiler is being used")
-        set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
-        set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_INCLUDES 1)
-        #This is a workaround that is needed to avoid forward-slashes in the
-        #filenames listed in response files from incorrectly being interpreted as
-        #introducing compiler command options
-        if(${BUILD_SHARED_LIBS})
-          message(FATAL_ERROR "Making of shared libraries with CVF has not been tested.")
-        endif()
-        set(str "NMake version 9 or later should be used. NMake version 6.0 which is\n")
-        set(str "${str} included with the CVF distribution fails to build Lapack because\n")
-        set(str "${str} the number of source files exceeds the limit for NMake v6.0\n")
-        message(STATUS ${str})
-        set(CMAKE_Fortran_LINK_EXECUTABLE "LINK /out: ")
-      endif()
-    endif()
-  endif()
-endif()
-
 # Add option to enable flat namespace for symbol resolution on macOS
 if(APPLE)
   option(USE_FLAT_NAMESPACE "Use flat namespaces for symbol resolution during build and runtime." OFF)
@@ -268,26 +194,6 @@
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/bin)
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/lib)
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/lib)
 
-# --------------------------------------------------
-# Check for any necessary platform specific compiler flags
-include(CheckLAPACKCompilerFlags)
-CheckLAPACKCompilerFlags()
-
-# --------------------------------------------------
-# Check second function
-
-include(CheckTimeFunction)
-set(TIME_FUNC NONE)
-CHECK_TIME_FUNCTION(NONE TIME_FUNC)
-CHECK_TIME_FUNCTION(INT_CPU_TIME TIME_FUNC)
-CHECK_TIME_FUNCTION(EXT_ETIME TIME_FUNC)
-CHECK_TIME_FUNCTION(EXT_ETIME_ TIME_FUNC)
-CHECK_TIME_FUNCTION(INT_ETIME TIME_FUNC)
-message(STATUS "--> Will use second_${TIME_FUNC}.f and dsecnd_${TIME_FUNC}.f as timing function.")
-
-set(SECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/second_${TIME_FUNC}.f)
-set(DSECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/dsecnd_${TIME_FUNC}.f)
-
 # deprecated LAPACK and LAPACKE routines
 option(BUILD_DEPRECATED "Build deprecated routines" OFF)
 message(STATUS "Build deprecated routines: ${BUILD_DEPRECATED}")
@@ -380,18 +286,27 @@ endif()
 
 # Check the usage of the user provided or automatically found LAPACK libraries
 if(LAPACK_LIBRARIES)
-  include(CheckFortranFunctionExists)
-  set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES})
-  # Check if new routine of 3.4.0 is in LAPACK_LIBRARIES
-  CHECK_FORTRAN_FUNCTION_EXISTS("dgeqrt" LATESTLAPACK_FOUND)
-  unset(CMAKE_REQUIRED_LIBRARIES)
-  if(LATESTLAPACK_FOUND)
-    message(STATUS "--> LAPACK supplied by user is WORKING, will use ${LAPACK_LIBRARIES}.")
+  include(CheckLanguage)
+  check_language(Fortran)
+  if(CMAKE_Fortran_COMPILER)
+    enable_language(Fortran)
+    include(CheckFortranFunctionExists)
+    set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES})
+    # Check if new routine of 3.4.0 is in LAPACK_LIBRARIES
+    CHECK_FORTRAN_FUNCTION_EXISTS("dgeqrt" LATESTLAPACK_FOUND)
+    unset(CMAKE_REQUIRED_LIBRARIES)
+    if(LATESTLAPACK_FOUND)
+      message(STATUS "--> LAPACK supplied by user is WORKING, will use ${LAPACK_LIBRARIES}.")
+    else()
+      message(ERROR "--> LAPACK supplied by user is not WORKING or is older than LAPACK 3.4.0, CANNOT USE ${LAPACK_LIBRARIES}.")
+      message(ERROR "--> Will use REFERENCE LAPACK (by default)")
+      message(ERROR "--> Or Correct your LAPACK_LIBRARIES entry ")
+      message(ERROR "--> Or Consider checking USE_OPTIMIZED_LAPACK")
+    endif()
   else()
-    message(ERROR "--> LAPACK supplied by user is not WORKING or is older than LAPACK 3.4.0, CANNOT USE ${LAPACK_LIBRARIES}.")
-    message(ERROR "--> Will use REFERENCE LAPACK (by default)")
-    message(ERROR "--> Or Correct your LAPACK_LIBRARIES entry ")
-    message(ERROR "--> Or Consider checking USE_OPTIMIZED_LAPACK")
+    message(STATUS "--> LAPACK supplied by user is ${LAPACK_LIBRARIES}.")
+    message(STATUS "--> CMake couldn't find a Fortran compiler, so it cannot check if the provided LAPACK library works.")
+    set(LATESTLAPACK_FOUND TRUE)
   endif()
 endif()
 
@@ -399,6 +314,27 @@
 if(NOT LATESTLAPACK_FOUND)
   message(STATUS "Using supplied NETLIB LAPACK implementation")
   set(LAPACK_LIBRARIES ${LAPACKLIB})
+
+  enable_language(Fortran)
+
+  # Check for any necessary platform specific compiler flags
+  include(CheckLAPACKCompilerFlags)
+  CheckLAPACKCompilerFlags()
+
+  # Check second function
+  include(CheckTimeFunction)
+  set(TIME_FUNC NONE)
+  CHECK_TIME_FUNCTION(NONE TIME_FUNC)
+  CHECK_TIME_FUNCTION(INT_CPU_TIME TIME_FUNC)
+  CHECK_TIME_FUNCTION(EXT_ETIME TIME_FUNC)
+  CHECK_TIME_FUNCTION(EXT_ETIME_ TIME_FUNC)
+  CHECK_TIME_FUNCTION(INT_ETIME TIME_FUNC)
+
+  # Set second function
+  message(STATUS "--> Will use second_${TIME_FUNC}.f and dsecnd_${TIME_FUNC}.f as timing function.")
+  set(SECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/second_${TIME_FUNC}.f)
+  set(DSECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/dsecnd_${TIME_FUNC}.f)
+
+  add_subdirectory(SRC)
 else()
   set(CMAKE_EXE_LINKER_FLAGS
@@ -431,9 +367,11 @@ endif()
 # Cache export target
 set(LAPACK_INSTALL_EXPORT_NAME_CACHE ${LAPACK_INSTALL_EXPORT_NAME})
 if(BUILD_TESTING OR LAPACKE_WITH_TMG)
+  enable_language(Fortran)
   if(LATESTLAPACK_FOUND AND LAPACKE_WITH_TMG)
     set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES})
     # Check if dlatms (part of tmg) is found
+    include(CheckFortranFunctionExists)
     CHECK_FORTRAN_FUNCTION_EXISTS("dlatms" LAPACK_WITH_TMGLIB_FOUND)
     unset(CMAKE_REQUIRED_LIBRARIES)
     if(NOT LAPACK_WITH_TMGLIB_FOUND)
@@ -448,6 +386,12 @@ endif()
 set(LAPACK_INSTALL_EXPORT_NAME ${LAPACK_INSTALL_EXPORT_NAME_CACHE})
 unset(LAPACK_INSTALL_EXPORT_NAME_CACHE)
 
+
+#-------------------------------------
+# LAPACKE
+# Include lapack.h and lapacke_mangling.h even if LAPACKE is not built
+add_subdirectory(LAPACKE/include)
+
 if(LAPACKE)
   add_subdirectory(LAPACKE)
 endif()
@@ -474,8 +418,8 @@ if (BLAS++)
   ExternalProject_Add(blaspp
     URL https://bitbucket.org/icl/blaspp/downloads/blaspp-2020.10.02.tar.gz
    CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/blaspp-prefix/src/blaspp
-    BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make
-    INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install
+    BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build .
+    INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install .
   )
   ExternalProject_Add_StepDependencies(blaspp build ${BLAS_LIBRARIES})
 endif()
@@ -487,16 +431,16 @@ if (LAPACK++)
   ExternalProject_Add(lapackpp
     URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz
    CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES=${LAPACK_LIBRARIES} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp
-    BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make
-    INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install
+    BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build .
+    INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install .
   )
 else ()
   # FIXME this does not really work as the libraries list gets converted to a semicolon-separated list somewhere in the lapack++ build files
   ExternalProject_Add(lapackpp
     URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz
    CONFIGURE_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES="${PROJECT_BINARY_DIR}/lib/liblapack.a -lgfortran" -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp
-    BUILD_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make
-    INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install
+    BUILD_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build .
+    INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install .
   )
 endif()
 ExternalProject_Add_StepDependencies(lapackpp build blaspp ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES})
@@ -671,22 +615,34 @@ if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION)
     set(DOXYGEN_PROJECT_BRIEF "LAPACK: Linear Algebra PACKage")
     set(DOXYGEN_PROJECT_NUMBER ${LAPACK_VERSION})
     set(DOXYGEN_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/DOCS)
-    set(PROJECT_LOGO ${CMAKE_CURRENT_SOURCE_DIR}/DOCS/lapack.png)
+    set(DOXYGEN_PROJECT_LOGO ${CMAKE_CURRENT_SOURCE_DIR}/DOCS/lapack.png)
     set(DOXYGEN_OPTIMIZE_FOR_FORTRAN YES)
     set(DOXYGEN_SOURCE_BROWSER YES)
-    set(DISTRIBUTE_GROUP_DOC YES)
     set(DOXYGEN_CREATE_SUBDIRS YES)
     set(DOXYGEN_SEPARATE_MEMBER_PAGES YES)
     set(DOXYGEN_EXTRACT_ALL YES)
-    set(DOXYGEN_FILE_PATTERNS "*.f;*.c;*.h")
+    set(DOXYGEN_FILE_PATTERNS *.f *.f90 *.c *.h )
     set(DOXYGEN_RECURSIVE YES)
     set(DOXYGEN_GENERATE_TREEVIEW YES)
+    set(DOXYGEN_DOT_IMAGE_FORMAT svg)
     set(DOXYGEN_INTERACTIVE_SVG YES)
-    set(DOXYGEN_QUIET YES)
-    set(DOXYGEN_WARNINGS NO)
+    set(DOXYGEN_QUIET NO)
+    set(DOXYGEN_WARNINGS YES)
+    set(DOXYGEN_WARN_NO_PARAMDOC YES)
+    set(DOXYGEN_WARN_LOGFILE doxygen_error)
     set(DOXYGEN_GENERATE_HTML NO)
     set(DOXYGEN_GENERATE_MAN NO)
-
+    set(DOXYGEN_LAYOUT_FILE "DOCS/DoxygenLayout.xml")
+
+    # Exclude functions that are duplicated, creating conflicts.
+    set(DOXYGEN_EXCLUDE .git
+                        .github
+                        SRC/VARIANTS
+                        BLAS/SRC/lsame.f
+                        BLAS/SRC/xerbla.f
+                        BLAS/SRC/xerbla_array.f
+                        INSTALL/slamchf77.f
+                        INSTALL/dlamchf77.f )
 
   if (BUILD_HTML_DOCUMENTATION)
     set(DOXYGEN_GENERATE_HTML YES)
@@ -697,13 +653,21 @@ if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION)
 
     doxygen_add_docs(
       html
-      ${PROJECT_SOURCE_DIR}
+
+      # Doxygen INPUT =
+      ${PROJECT_SOURCE_DIR}/README.md
+      ${PROJECT_SOURCE_DIR}/BLAS
+      ${PROJECT_SOURCE_DIR}/CBLAS
+      ${PROJECT_SOURCE_DIR}/SRC
+      ${PROJECT_SOURCE_DIR}/INSTALL
+      ${PROJECT_SOURCE_DIR}/TESTING
+      ${PROJECT_SOURCE_DIR}/DOCS/groups-usr.dox
+
       COMMENT "Generating html LAPACK documentation (it will take some time... time to grab a coffee)"
     )
   endif()
 
   if (BUILD_MAN_DOCUMENTATION)
     set(DOXYGEN_GENERATE_MAN YES)
-    set(DOXYGEN_EXCLUDE SRC/VARIANTS)
     set(DOXYGEN_MAN_LINKS YES)
     set(DOXYGEN_INLINE_SOURCES NO)
     set(DOXYGEN_CALL_GRAPH NO)
@@ -711,7 +675,15 @@ if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION)
 
     doxygen_add_docs(
       man
-      ${PROJECT_SOURCE_DIR}
+
+      # Doxygen INPUT =
+      ${PROJECT_SOURCE_DIR}/BLAS
+      ${PROJECT_SOURCE_DIR}/CBLAS
+      ${PROJECT_SOURCE_DIR}/SRC
+      ${PROJECT_SOURCE_DIR}/INSTALL
+      ${PROJECT_SOURCE_DIR}/TESTING
+      ${PROJECT_SOURCE_DIR}/DOCS/groups-usr.dox
+
       COMMENT "Generating man LAPACK documentation"
     )
   endif()
diff --git a/lapack-netlib/DOCS/Doxyfile b/lapack-netlib/DOCS/Doxyfile
index 43cea43b5..577675772 100644
--- a/lapack-netlib/DOCS/Doxyfile
+++ b/lapack-netlib/DOCS/Doxyfile
@@ -1,4 +1,4 @@
-# Doxyfile 1.8.10
+# Doxyfile 1.9.1
 
 # This file describes the settings to be used by the documentation system
 # doxygen (www.doxygen.org) for a project.
@@ -17,11 +17,11 @@
 # Project related configuration options
 #---------------------------------------------------------------------------
 
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
-# for the list of possible encodings.
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
 # The default value is: UTF-8.
 
 DOXYFILE_ENCODING = UTF-8
@@ -38,7 +38,7 @@ PROJECT_NAME = LAPACK
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER = 3.9.0
+PROJECT_NUMBER = 3.12.0
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
@@ -93,6 +93,14 @@ ALLOW_UNICODE_NAMES = NO
 
 OUTPUT_LANGUAGE = English
 
+# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all generated output in the proper direction.
+# Possible values are: None, LTR, RTL and Context.
+# The default value is: None.
+
+OUTPUT_TEXT_DIRECTION = None
+
 # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
 # descriptions after the members that are listed in the file and class
 # documentation (similar to Javadoc). Set to NO to disable this.
@@ -179,6 +187,16 @@ SHORT_NAMES = NO
 
 JAVADOC_AUTOBRIEF = NO
 
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER = NO
+
 # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
 # line (until the first dot) of a Qt-style comment as the brief description. If
 # set to NO, the Qt-style will behave just like regular Qt-style comments (thus
@@ -199,6 +217,14 @@ QT_AUTOBRIEF = NO
 
 MULTILINE_CPP_IS_BRIEF = NO
 
+# By default Python docstrings are displayed as preformatted text and doxygen's
+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the
+# doxygen's special commands can be used and the contents of the docstring
+# documentation blocks is shown as doxygen documentation.
+# The default value is: YES.
+
+PYTHON_DOCSTRING = YES
+
 # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
 # documentation from any documented member that it re-implements.
 # The default value is: YES.
@@ -226,16 +252,15 @@ TAB_SIZE = 8
 # will allow you to put the command \sideeffect (or @sideeffect) in the
 # documentation, which will result in a user-defined paragraph with heading
 # "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines.
+# newlines (in the resulting output). You can put ^^ in the value part of an
+# alias to insert a newline as if a physical newline was in the original file.
+# When you need a literal { or } or , in the value part of an alias you have to
+# escape them by means of a backslash (\), this can lead to conflicts with the
+# commands \{ and \} for these it is advised to use the version @{ and @} or use
+# a double escape (\\{ and \\})
 
 ALIASES =
 
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST =
-
 # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
 # only. Doxygen will then generate output that is more tailored for C. For
 # instance, some of the names that are used will be different. The list of all
@@ -264,28 +289,40 @@ OPTIMIZE_FOR_FORTRAN = YES
 
 OPTIMIZE_OUTPUT_VHDL = NO
 
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE = NO
+
 # Doxygen selects the parser to use depending on the extension of the files it
 # parses. With this tag you can assign which parser to use for a given
 # extension. Doxygen has a built-in mapping, but you can override or extend it
 # using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
-# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
-# Fortran. In the later case the parser tries to guess whether the code is fixed
-# or free formatted code, this is the default for Fortran type files), VHDL. For
-# instance to make doxygen treat .inc files as Fortran files (default is PHP),
-# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
+# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL,
+# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files). For instance to make doxygen treat .inc files
+# as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C.
 #
 # Note: For files without extension you can use no_extension as a placeholder.
 #
 # Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
+# the files are not read by doxygen. When specifying no_extension you should add
+# * to the FILE_PATTERNS.
+#
+# Note see also the list of default file extension mappings.
 
 EXTENSION_MAPPING =
 
 # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
 # according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
 # The output of markdown processing is further processed by doxygen, so you can
 # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
 # case of backward compatibilities issues.
@@ -293,6 +330,15 @@ EXTENSION_MAPPING =
 
 MARKDOWN_SUPPORT = YES
 
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 5.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS = 5
+
 # When enabled doxygen tries to link words that correspond to documented
 # classes, or namespaces to their corresponding documentation. Such a link can
 # be prevented in individual cases by putting a % sign in front of the word or
@@ -318,7 +364,7 @@ BUILTIN_STL_SUPPORT = NO
 CPP_CLI_SUPPORT = NO
 
 # Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
 # will parse them like normal C++ but will assume all classes use public instead
 # of private inheritance when no explicit protection keyword is present.
 # The default value is: NO.
@@ -341,7 +387,7 @@ IDL_PROPERTY_SUPPORT = YES
 # all members of a group must be documented explicitly.
 # The default value is: NO.
 
-DISTRIBUTE_GROUP_DOC = YES
+DISTRIBUTE_GROUP_DOC = NO
 
 # If one adds a struct or class to a group and this option is enabled, then also
 # any nested class or struct is added to the same group. By default this option
@@ -404,6 +450,19 @@ TYPEDEF_HIDES_STRUCT = NO
 
 LOOKUP_CACHE_SIZE = 0
 
+# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use
+# during processing. When set to 0 doxygen will based this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which efficively disables parallel processing. Please report any issues you
+# encounter. Generating dot graphs in parallel is controlled by the
+# DOT_NUM_THREADS setting.
+# Minimum value: 0, maximum value: 32, default value: 1.
+
+NUM_PROC_THREADS = 1
+
 #---------------------------------------------------------------------------
 # Build related configuration options
 #---------------------------------------------------------------------------
@@ -424,6 +483,12 @@ EXTRACT_ALL = YES
 
 EXTRACT_PRIVATE = NO
 
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL = NO
+
 # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
 # scope will be included in the documentation.
 # The default value is: NO.
@@ -461,6 +526,13 @@ EXTRACT_LOCAL_METHODS = NO
 
 EXTRACT_ANON_NSPACES = NO
 
+# If this flag is set to YES, the name of an unnamed parameter in a declaration
+# will be determined by the corresponding definition. By default unnamed
+# parameters remain unnamed in the output.
+# The default value is: YES.
+
+RESOLVE_UNNAMED_PARAMS = YES
+
 # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
 # undocumented members inside documented classes or files. If set to NO these
 # members will be included in the various overviews, but no documentation
@@ -478,8 +550,8 @@ HIDE_UNDOC_MEMBERS = NO
 HIDE_UNDOC_CLASSES = NO
 
 # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO, these declarations will be
-# included in the documentation.
+# declarations. If set to NO, these declarations will be included in the
+# documentation.
 # The default value is: NO.
 
 HIDE_FRIEND_COMPOUNDS = NO
@@ -498,11 +570,18 @@ HIDE_IN_BODY_DOCS = NO
 
 INTERNAL_DOCS = NO
 
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES, upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
+# With the correct setting of option CASE_SENSE_NAMES doxygen will better be
+# able to match the capabilities of the underlying filesystem. In case the
+# filesystem is case sensitive (i.e. it supports files in the same directory
+# whose names only differ in casing), the option must be set to YES to properly
+# deal with such files in case they appear in the input. For filesystems that
+# are not case sensitive the option should be be set to NO to properly deal with
+# output files written for symbols that only differ in casing, such as for two
+# classes, one named CLASS and the other named Class, and to also support
+# references to files without having to specify the exact matching casing. On
+# Windows (including Cygwin) and MacOS, users should typically set this option
+# to NO, whereas on Linux or other Unix flavors it should typically be set to
+# YES.
 # The default value is: system dependent.
 
 CASE_SENSE_NAMES = NO
@@ -684,12 +763,12 @@ FILE_VERSION_FILTER =
 # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
 # tag is left empty.
 
-LAYOUT_FILE =
+LAYOUT_FILE = DOCS/DoxygenLayout.xml
 
 # The CITE_BIB_FILES tag can be used to specify one or more bib files containing
 # the reference definitions. This must be a list of .bib files. The .bib
 # extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
 # For LaTeX the style of the bibliography can be controlled using
 # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
 # search path. See also \cite for info how to create references.
@@ -705,7 +784,7 @@ CITE_BIB_FILES =
 # messages are off.
 # The default value is: NO.
 
-QUIET = YES
+QUIET = NO
 
 # The WARNINGS tag can be used to turn on/off the warning messages that are
 # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
@@ -734,10 +813,20 @@ WARN_IF_DOC_ERROR = YES
 # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
 # are documented, but have no documentation for their parameters or return
 # value. If set to NO, doxygen will only warn about wrong or incomplete
-# parameter documentation, but not about the absence of documentation.
+# parameter documentation, but not about the absence of documentation. If
+# EXTRACT_ALL is set to YES then this flag will automatically be disabled.
 # The default value is: NO.
 
-WARN_NO_PARAMDOC = NO
+WARN_NO_PARAMDOC = YES
+
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
+# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
+# at the end of the doxygen process doxygen will return with a non-zero status.
+# Possible values are: NO, YES and FAIL_ON_WARNINGS.
+# The default value is: NO.
+
+WARN_AS_ERROR = NO
 
 # The WARN_FORMAT tag determines the format of the warning messages that doxygen
 # can produce. The string should contain the $file, $line, and $text tags, which
@@ -753,7 +842,7 @@ WARN_FORMAT = "$file:$line: $text"
 # messages should be written. If left blank the output is written to standard
 # error (stderr).
 
-WARN_LOGFILE = output_err
+WARN_LOGFILE = doxygen_error
 
 #---------------------------------------------------------------------------
 # Configuration options related to the input files
 #---------------------------------------------------------------------------
@@ -762,17 +851,18 @@
 # The INPUT tag is used to specify the files and/or directories that contain
 # documented source files. You may enter file names like myfile.cpp or
 # directories like /usr/src/myproject. Separate the files or directories with
-# spaces.
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.
 
-INPUT = . \
-        DOCS/groups-usr.dox
+INPUT = BLAS CBLAS SRC INSTALL TESTING \
+        DOCS/groups-usr.dox \
+        README.md
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
 # libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
-# possible encodings.
+# documentation (see:
+# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
 # The default value is: UTF-8.
 
 INPUT_ENCODING = UTF-8
@@ -785,14 +875,19 @@ INPUT_ENCODING = UTF-8
 # need to set EXTENSION_MAPPING for the extension otherwise the files are not
 # read by doxygen.
 #
+# Note the list of default checked file patterns might differ from the list of
+# default file extension mappings.
+#
 # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
 # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
 # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd,
-# *.vhdl, *.ucf, *.qsf, *.as and *.js.
+# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment),
+# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl,
+# *.ucf, *.qsf and *.ice.
 
 FILE_PATTERNS = *.c \
                 *.f \
+                *.f90 \
                 *.h
 
 # The RECURSIVE tag can be used to specify whether or not subdirectories should
@@ -808,34 +903,15 @@ RECURSIVE = YES
 # Note that relative paths are relative to the directory from which doxygen is
 # run.
 
-EXCLUDE = CMAKE \
-          DOCS \
-          .svn \
-          CBLAS/.svn \
-          CBLAS/src/.svn \
-          CBLAS/testing/.svn \
-          CBLAS/example/.svn \
-          CBLAS/include/.svn \
-          BLAS/.svn \
-          BLAS/SRC/.svn \
-          BLAS/TESTING/.svn \
-          SRC/.svn \
-          SRC/VARIANTS/.svn \
-          SRC/VARIANTS/LIB/.svn \
-          SRC/VARIANTS/cholesky/.svn \
-          SRC/VARIANTS/cholesky/RL/.svn \
-          SRC/VARIANTS/cholesky/TOP/.svn \
-          SRC/VARIANTS/lu/.svn \
-          SRC/VARIANTS/lu/CR/.svn \
-          SRC/VARIANTS/lu/LL/.svn \
-          SRC/VARIANTS/lu/REC/.svn \
-          SRC/VARIANTS/qr/.svn \
-          SRC/VARIANTS/qr/LL/.svn \
-          INSTALL/.svn \
-          TESTING/.svn \
-          TESTING/EIG/.svn \
-          TESTING/MATGEN/.svn \
-          TESTING/LIN/.svn
+# Exclude functions that are duplicated, creating conflicts.
+EXCLUDE = .git \
+          .github \
+          SRC/VARIANTS \
+          BLAS/SRC/lsame.f \
+          BLAS/SRC/xerbla.f \
+          BLAS/SRC/xerbla_array.f \
+          INSTALL/slamchf77.f \
+          INSTALL/dlamchf77.f \
 
 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
@@ -908,6 +984,10 @@ IMAGE_PATH =
 # Note that the filter must not add or remove lines; it is applied before the
 # code is scanned, but not when the output code is generated. If lines are added
 # or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
 
 INPUT_FILTER =
 
@@ -917,6 +997,10 @@ INPUT_FILTER =
 # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
 # filters are used. If the FILTER_PATTERNS tag is empty or if none of the
 # patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
 
 FILTER_PATTERNS =
 
@@ -969,7 +1053,7 @@ INLINE_SOURCES = YES
 STRIP_CODE_COMMENTS = YES
 
 # If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
+# entity all documented functions referencing it will be listed.
 # The default value is: NO.
 
 REFERENCED_BY_RELATION = NO
@@ -1001,12 +1085,12 @@ SOURCE_TOOLTIPS = YES
 # If the USE_HTAGS tag is set to YES then the references to source code will
 # point to the HTML generated by the htags(1) tool instead of doxygen built-in
 # source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
+# (see https://www.gnu.org/software/global/global.html). You will need version
 # 4.8.6 or higher.
 #
 # To use it do the following:
 # - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
 # - Make sure the INPUT points to the root of the source tree
 # - Run doxygen as normal
 #
@@ -1028,25 +1112,6 @@ USE_HTAGS = NO
 
 VERBATIM_HEADERS = YES
 
-# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
-# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
-# cost of reduced performance. This can be particularly helpful with template
-# rich C++ code for which doxygen's built-in parser lacks the necessary type
-# information.
-# Note: The availability of this option depends on whether or not doxygen was
-# compiled with the --with-libclang option.
-# The default value is: NO.
-
-CLANG_ASSISTED_PARSING = NO
-
-# If clang assisted parsing is enabled you can provide the compiler with command
-# line options that you would normally use when invoking the compiler. Note that
-# the include paths will already be set by doxygen for the files and directories
-# specified with INPUT and INCLUDE_PATH.
-# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
-
-CLANG_OPTIONS =
-
 #---------------------------------------------------------------------------
 # Configuration options related to the alphabetical class index
 #---------------------------------------------------------------------------
@@ -1058,13 +1123,6 @@ CLANG_OPTIONS =
 
 ALPHABETICAL_INDEX = YES
 
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX = 5
-
 # In case all classes in a project start with a common prefix, all classes will
 # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
 # can be used to specify a prefix (or a list of prefixes) that should be ignored
@@ -1165,7 +1223,7 @@ HTML_EXTRA_FILES =
 # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
 # will adjust the colors in the style sheet and background images according to
 # this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
 # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
 # purple, and 360 is red again.
 # Minimum value: 0, maximum value: 359, default value: 220.
@@ -1201,6 +1259,17 @@ HTML_COLORSTYLE_GAMMA = 80
 
 HTML_TIMESTAMP = YES
 
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via JavaScript. If disabled, the navigation index will
+# consists of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have JavaScript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS = YES
+
 # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
 # documentation will contain sections that can be hidden and shown after the
 # page has loaded.
@@ -1224,13 +1293,14 @@ HTML_INDEX_NUM_ENTRIES = 100
 
 # If the GENERATE_DOCSET tag is set to YES, additional index files will be
 # generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
 # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
+# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
+# genXcode/_index.html for more information.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTML is set to YES.
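Regarding the warning settings switched on earlier in this Doxyfile diff (WARN_NO_PARAMDOC = YES, with output collected in the doxygen_error log): doxygen now complains about documented functions whose parameters lack \param tags. A hypothetical C routine -- not taken from the LAPACK sources -- that satisfies the stricter check looks like this:

    /**
     * @brief Scales a vector by a constant: x := alpha * x.
     *
     * @param[in]     n      Number of elements in @p x.
     * @param[in]     alpha  Scale factor applied to every element.
     * @param[in,out] x      Array of length @p n; overwritten with alpha * x.
     */
    void example_scal(int n, float alpha, float *x)
    {
        for (int i = 0; i < n; ++i)
            x[i] *= alpha;
    }

Note that the same Doxyfile keeps EXTRACT_ALL = YES, which -- per the option's own comment above -- automatically disables this particular warning, so the setting mainly takes effect in configurations that override EXTRACT_ALL.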
@@ -1269,8 +1339,8 @@ DOCSET_PUBLISHER_NAME = Publisher
 # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
 # additional HTML index files: index.hhp, index.hhc, and index.hhk. The
 # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
+# (see:
+# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows.
 #
 # The HTML Help Workshop contains a compiler that can convert all HTML output
 # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
@@ -1300,7 +1370,7 @@ CHM_FILE =
 
 HHC_LOCATION =
 
 # The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
+# (YES) or that it should be included in the main .chm file (NO).
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
@@ -1345,7 +1415,8 @@ QCH_FILE =
 
 # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
 # Project output. For more information please see Qt Help Project / Namespace
-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
 # The default value is: org.doxygen.Project.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_NAMESPACE = org.doxygen.Project
 
 # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
 # Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
-# folders).
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
 # The default value is: doc.
 # This tag requires that the tag GENERATE_QHP is set to YES.
@@ -1362,30 +1433,30 @@ QHP_VIRTUAL_FOLDER = doc
 
 # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
 # filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_NAME =
 
 # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
 # custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_ATTRS =
 
 # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
 # project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_SECT_FILTER_ATTRS =
 
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
+# The QHG_LOCATION tag can be used to specify the location (absolute path
+# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
+# run qhelpgenerator on the generated .qhp file.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHG_LOCATION =
@@ -1462,6 +1533,17 @@ TREEVIEW_WIDTH = 250
 
 EXT_LINKS_IN_WINDOW = NO
 
+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
+# the HTML output. These images will generally look nicer at scaled resolutions.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
+# The default value is: png.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FORMULA_FORMAT = png
+
 # Use this tag to change the font size of LaTeX formulas included as images in
 # the HTML documentation. When you change the font size after a successful
 # doxygen run you need to manually remove any form_*.png images from the HTML
@@ -1471,7 +1553,7 @@ EXT_LINKS_IN_WINDOW = NO
 
 FORMULA_FONTSIZE = 10
 
-# Use the FORMULA_TRANPARENT tag to determine whether or not the images
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
 # generated for formulas are transparent PNGs. Transparent PNGs are not
 # supported properly for IE 6.0, but are supported on all modern browsers.
 #
@@ -1482,8 +1564,14 @@ FORMULA_FONTSIZE = 10
 
 FORMULA_TRANSPARENT = YES
 
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE =
+
 # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
 # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
 # installed or if you want to formulas look prettier in the HTML output. When
 # enabled you may also need to install MathJax separately and configure the path
@@ -1495,7 +1583,7 @@ USE_MATHJAX = NO
 
 # When MathJax is enabled you can set the default output format to be used for
 # the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details.
 # Possible values are: HTML-CSS (which is slower, but has the best
 # compatibility), NativeMML (i.e. MathML) and SVG.
 # The default value is: HTML-CSS.
@@ -1510,8 +1598,8 @@
 # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
 # Content Delivery Network so you can quickly see the result without installing
 # MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# MathJax from https://www.mathjax.org before deployment.
+# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
 MATHJAX_RELPATH = http://www.mathjax.org/mathjax
@@ -1525,7 +1613,8 @@ MATHJAX_EXTENSIONS =
 
 # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
 # of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
 # example see the documentation.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
 MATHJAX_CODEFILE =
@@ -1553,7 +1642,7 @@ MATHJAX_CODEFILE =
 SEARCHENGINE = YES
 
 # When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
+# implemented using a web server instead of a web client using JavaScript. There
 # are two flavors of web server based searching depending on the EXTERNAL_SEARCH
 # setting. When disabled, doxygen will generate a PHP script for searching and
 # an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
@@ -1572,7 +1661,8 @@ SERVER_BASED_SEARCH = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/).
+# Xapian (see:
+# https://xapian.org/).
 #
 # See the section "External Indexing and Searching" for details.
 # The default value is: NO.
@@ -1585,8 +1675,9 @@ EXTERNAL_SEARCH = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/). See the section "External Indexing and
-# Searching" for details.
+# Xapian (see:
+# https://xapian.org/). See the section "External Indexing and Searching" for
+# details.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
 SEARCHENGINE_URL =
@@ -1637,21 +1728,35 @@ LATEX_OUTPUT = latex
 # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
 # invoked.
 #
-# Note that when enabling USE_PDFLATEX this option is only used for generating
-# bitmaps for formulas in the HTML output, but not in the Makefile that is
-# written to the output directory.
-# The default file is: latex.
+# Note that when not enabling USE_PDFLATEX the default is latex when enabling
+# USE_PDFLATEX the default is pdflatex and when in the later case latex is
+# chosen this is overwritten by pdflatex. For specific output languages the
+# default can have been set differently, this depends on the implementation of
+# the output language.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_CMD_NAME = latex
 
 # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
 # index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
 # The default file is: makeindex.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 MAKEINDEX_CMD_NAME = makeindex
 
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD = makeindex
+
 # If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
 # documents. This may be useful for small projects and may help to save some
 # trees in general.
@@ -1736,9 +1841,11 @@ LATEX_EXTRA_FILES =
 
 PDF_HYPERLINKS = YES
 
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES, to get a
-# higher quality PDF documentation.
+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
+# files. Set this option to YES, to get a higher quality PDF documentation.
+#
+# See also section LATEX_CMD_NAME for selecting the engine.
 # The default value is: YES.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
@@ -1772,12 +1879,28 @@ LATEX_SOURCE_CODE = NO
 
 # The LATEX_BIB_STYLE tag can be used to specify the style to use for the
 # bibliography, e.g. plainnat, or ieeetr. See
-# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
 # The default value is: plain.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_BIB_STYLE = plain
 
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP = NO
+
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY =
+
 #---------------------------------------------------------------------------
 # Configuration options related to the RTF output
 #---------------------------------------------------------------------------
@@ -1817,9 +1940,9 @@ COMPACT_RTF = NO
 
 RTF_HYPERLINKS = YES
 
-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
-# file, i.e. a series of assignments. You only have to provide replacements,
-# missing definitions are set to their default value.
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# configuration file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
 #
 # See also section "Doxygen usage" for information on how to generate the
 # default style sheet that doxygen normally uses.
@@ -1828,8 +1951,8 @@ RTF_HYPERLINKS = YES
 
 RTF_STYLESHEET_FILE =
 
 # Set optional variables used in the generation of an RTF document. Syntax is
-# similar to doxygen's config file. A template extensions file can be generated
-# using doxygen -e rtf extensionFile.
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
 # This tag requires that the tag GENERATE_RTF is set to YES.
 
 RTF_EXTENSIONS_FILE =
@@ -1915,6 +2038,13 @@ XML_OUTPUT = xml
 
 XML_PROGRAMLISTING = YES
 
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_NS_MEMB_FILE_SCOPE = NO
+
 #---------------------------------------------------------------------------
 # Configuration options related to the DOCBOOK output
 #---------------------------------------------------------------------------
@@ -1947,9 +2077,9 @@ DOCBOOK_PROGRAMLISTING = NO
 #---------------------------------------------------------------------------
 
 # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
-# AutoGen Definitions (see http://autogen.sf.net) file that captures the
-# structure of the code including all documentation. Note that this feature is
-# still experimental and incomplete at the moment.
+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
 # The default value is: NO.
 
 GENERATE_AUTOGEN_DEF = NO
@@ -2116,12 +2246,6 @@ EXTERNAL_GROUPS = YES
 
 EXTERNAL_PAGES = YES
 
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of 'which perl').
-# The default file (with absolute path) is: /usr/bin/perl.
-
-PERL_PATH = /sw/bin/perl
-
 #---------------------------------------------------------------------------
 # Configuration options related to the dot tool
 #---------------------------------------------------------------------------
@@ -2135,15 +2259,6 @@ PERL_PATH = /sw/bin/perl
 
 CLASS_DIAGRAMS = YES
 
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH =
-
 # You can include diagrams made with dia in doxygen documentation. Doxygen will
 # then run dia to produce the diagram and insert it in the documentation. The
 # DIA_PATH tag allows you to specify the directory where the dia binary resides.
@@ -2241,10 +2356,32 @@ UML_LOOK = NO
 # but if the number exceeds 15, the total amount of fields shown is limited to
 # 10.
 # Minimum value: 0, maximum value: 100, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
+# This tag requires that the tag UML_LOOK is set to YES.
 
 UML_LIMIT_NUM_FIELDS = 10
 
+# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
+# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
+# tag is set to YES, doxygen will add type and arguments for attributes and
+# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
+# will not generate fields with class member information in the UML graphs. The
+# class diagrams will look similar to the default class diagrams but using UML
+# notation for the relationships.
+# Possible values are: NO, YES and NONE.
+# The default value is: NO.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+DOT_UML_DETAILS = NO
+
+# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
+# to display on a single line. If the actual line length exceeds this threshold
+# significantly it will be wrapped across multiple lines. Some heuristics are
+# applied to avoid ugly line breaks.
+# Minimum value: 0, maximum value: 1000, default value: 17.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_WRAP_THRESHOLD = 17
+
 # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
 # collaboration graphs will show the relations between templates and their
 # instances.
@@ -2371,6 +2508,11 @@ DIAFILE_DIRS =
 
 PLANTUML_JAR_PATH =
 
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE =
+
 # When using plantuml, the specified paths are searched for files specified by
 # the !include statement in a plantuml block.
@@ -2429,9 +2571,11 @@ DOT_MULTI_TARGETS = NO
 
 GENERATE_LEGEND = YES
 
-# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
 # files that are used to generate the various graphs.
+#
+# Note: This setting is not only used for dot files but also for msc and
+# plantuml temporary files.
 # The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_CLEANUP = YES
diff --git a/lapack-netlib/DOCS/DoxygenLayout.xml b/lapack-netlib/DOCS/DoxygenLayout.xml
new file mode 100644
index 000000000..aeb346d8d
--- /dev/null
+++ b/lapack-netlib/DOCS/DoxygenLayout.xml
@@ -0,0 +1,197 @@
+[197 added lines of Doxygen layout XML; the element markup did not survive extraction and is omitted here]
diff --git a/lapack-netlib/DOCS/groups-usr.dox b/lapack-netlib/DOCS/groups-usr.dox
index 6c31e4cf5..cbd747165 100644
--- a/lapack-netlib/DOCS/groups-usr.dox
+++ b/lapack-netlib/DOCS/groups-usr.dox
@@ -1,605 +1,974 @@
 /**
- *
- ****
- *
- * @defgroup lapack LAPACK
- * This is the group of LAPACK routines.
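The groups-usr.dox rewrite that starts here replaces the flat scheme visible in the removed lines, where every group names its parent explicitly via @ingroup, with a nested hierarchy in which parentage follows from @{ ... @} blocks and leaf groups are named after routine bases. A minimal sketch of the two styles, using only group names that appear in this diff:

    /* old scheme: parent declared per group */
    /**
     * @defgroup solveGE Linear Solve
     * @ingroup GE
     * This is the group of Linear Solve Driver routines
     */

    /* new scheme: parent implied by nesting */
    /**
    @defgroup lapack_top LAPACK
    @{
      @defgroup solve_top Linear solve, AX = B
      @{
        @defgroup gesv gesv: factor and solve
      @}
    @}
    */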
- * - * @defgroup GE General Matrices - * @ingroup lapack - * This is the group of General Matrices routines - * @defgroup GB General Band Matrix - * @ingroup lapack - * This is the group of General Band routines - * @defgroup SY Symmetric Matrix - * @ingroup lapack - * This is the group of Symmetric Matrix routines - * @defgroup HE Hermitian Matrix - * @ingroup lapack - * This is the group of Hermitian Matrix routines - * @defgroup PO Positive Definite Matrix - * @ingroup lapack - * This is the group of Positive Definite routines - * @defgroup GT General tridiagonal Matrix - * @ingroup lapack - * This is the group of General tridiagonal routines - * @defgroup PT Positive Definite tridiagonal Matrix - * @ingroup lapack - * This is the group of Positive Definite tridiagonal routines - * @defgroup OTHEReigen Eigenvalue - * @ingroup lapack - * This is the group of Other Eigenvalue routines - * @defgroup OTHERauxiliary Other Auxiliary Routines - * @ingroup lapack - * This is the group of Other Auxiliary routines - * @defgroup OTHERcomputational Other Computational Routines - * @ingroup lapack - * This is the group of Other Computational routines - * @defgroup OTHERsolve Other Solve Routines - * @ingroup lapack - * This is the group of Other Solve routines - * - **** - * - * @defgroup solveGE Linear Solve - * @ingroup GE - * This is the group of Linear Solve Driver routines - * @defgroup solveGB Linear Solve - * @ingroup GB - * This is the group of Linear Solve Driver routines - * @defgroup solveSY Linear Solve - * @ingroup SY - * This is the group of Linear Solve Driver routines - * @defgroup solveHE Linear Solve - * @ingroup HE - * This is the group of Linear Solve Driver routines - * @defgroup solvePO Linear Solve - * @ingroup PO - * This is the group of Linear Solve Driver routines - * @defgroup solveGT Linear Solve - * @ingroup GT - * This is the group of Linear Solve Driver routines - * @defgroup solvePT Linear Solve - * @ingroup PT - * This is the group of Linear Solve Driver routines - * @defgroup eigenGE Eigenvalue - * @ingroup GE - * This is the group of Eigenvalue Driver routines - * @defgroup eigenSY Eigenvalue - * @ingroup SY - * This is the group of Eigenvalue Driver routines - * @defgroup eigenHE Eigenvalue - * @ingroup HE - * This is the group of Eigenvalue Driver routines - * @defgroup singGE Singular Value - * @ingroup GE - * This is the group of Singular Value Driver routines - * @defgroup computationalGE Computational routines - * @ingroup GE - * This is the group of Computational routines - * @defgroup variantsGEcomputational Variants Computational routines - * @ingroup GE - * This is the group of Variants Computational routines - * @defgroup computationalGB Computational routines - * @ingroup GB - * This is the group of Computational routines - * @defgroup computationalSY Computational routines - * @ingroup SY - * This is the group of Computational routines - * @defgroup computationalHE Computational routines - * @ingroup HE - * This is the group of Computational routines - * @defgroup computationalPO Computational routines - * @ingroup PO - * This is the group of Computational routines - * @defgroup variantsPOcomputational Variants Computational routines - * @ingroup PO - * This is the group of Variants Computational routines - * @defgroup computationalGT Computational routines - * @ingroup GT - * This is the group of Computational routines - * @defgroup computationalPT Computational routines - * @ingroup PT - * This is the group of Computational routines - * @defgroup 
variantsOTHERcomputational Variants Computational routines - * @ingroup OTHERcomputational - * This is the group of Variants Computational routines - * @defgroup auxiliaryGE Auxiliary routines - * @ingroup GE - * This is the group of Auxiliary routines - * @defgroup auxiliaryGB Auxiliary routines - * @ingroup GB - * This is the group of Auxiliary routines - * @defgroup auxiliarySY Auxiliary routines - * @ingroup SY - * This is the group of Auxiliary routines - * @defgroup auxiliaryHE Auxiliary routines - * @ingroup HE - * This is the group of Auxiliary routines - * @defgroup auxiliaryPO Auxiliary routines - * @ingroup PO - * This is the group of Auxiliary routines - * @defgroup auxiliaryGT Auxiliary routines - * @ingroup GT - * This is the group of Auxiliary routines - * @defgroup auxiliaryPT Auxiliary routines - * @ingroup PT - * This is the group of Auxiliary routines - * - **** - * - * @defgroup doubleGEsolve double - * @ingroup solveGE - * This is the group of double solve driver functions for GE matrices - * @defgroup doublePOsolve double - * @ingroup solvePO - * This is the group of double solve driver functions for PO matrices - * @defgroup doubleSYsolve double - * @ingroup solveSY - * This is the group of double solve driver functions for SY matrices - * @defgroup doubleGBsolve double - * @ingroup solveGB - * This is the group of double solve driver functions for GB matrices - * @defgroup doubleGTsolve double - * @ingroup solveGT - * This is the group of double solve driver functions for GT matrices - * @defgroup doublePTsolve double - * @ingroup solvePT - * This is the group of double solve driver functions for PT matrices - * @defgroup doubleGEeigen double - * @ingroup eigenGE - * This is the group of double eigenvalue driver functions for GE matrices - * @defgroup doubleSYeigen double - * @ingroup eigenSY - * This is the group of double eigenvalue driver functions for SY matrices - * @defgroup doubleGEsing double - * @ingroup singGE - * This is the group of double singular value driver functions for GE matrices - * @defgroup doubleGEcomputational double - * @ingroup computationalGE - * This is the group of double computational functions for GE matrices - * @defgroup doublePOcomputational double - * @ingroup computationalPO - * This is the group of double computational functions for PO matrices - * @defgroup doubleSYcomputational double - * @ingroup computationalSY - * This is the group of double computational functions for SY matrices - * @defgroup doubleGBcomputational double - * @ingroup computationalGB - * This is the group of double computational functions for GB matrices - * @defgroup doubleGTcomputational double - * @ingroup computationalGT - * This is the group of double computational functions for GT matrices - * @defgroup doublePTcomputational double - * @ingroup computationalPT - * This is the group of double computational functions for PT matrices - * @defgroup doubleGEauxiliary double - * @ingroup auxiliaryGE - * This is the group of double auxiliary functions for GE matrices - * @defgroup doublePOauxiliary double - * @ingroup auxiliaryPO - * This is the group of double auxiliary functions for PO matrices - * @defgroup doubleSYauxiliary double - * @ingroup auxiliarySY - * This is the group of double auxiliary functions for SY matrices - * @defgroup doubleGBauxiliary double - * @ingroup auxiliaryGB - * This is the group of double auxiliary functions for GB matrices - * @defgroup doublePTauxiliary double - * @ingroup auxiliaryPT - * This is the group of double 
auxiliary functions for PT matrices - * @defgroup doubleGTauxiliary double - * @ingroup auxiliaryGT - * This is the group of double auxiliary functions for GT matrices - * @defgroup doubleOTHERauxiliary double - * @ingroup OTHERauxiliary - * This is the group of double other auxiliary routines - * @defgroup doubleOTHERcomputational double - * @ingroup OTHERcomputational - * This is the group of double other Computational routines - * @defgroup doubleOTHERsolve double Other Solve Routines - * @ingroup OTHERsolve - * This is the group of double Other Solve routines - * @defgroup doubleOTHEReigen double - * @ingroup OTHEReigen - * This is the group of double Other Eigenvalue routines - * - **** - * - * @defgroup realGEsolve real - * @ingroup solveGE - * This is the group of real solve driver functions for GE matrices - * @defgroup realPOsolve real - * @ingroup solvePO - * This is the group of real solve driver functions for PO matrices - * @defgroup realSYsolve real - * @ingroup solveSY - * This is the group of real solve driver functions for SY matrices - * @defgroup realGBsolve real - * @ingroup solveGB - * This is the group of real solve driver functions for GB matrices - * @defgroup realGTsolve real - * @ingroup solveGT - * This is the group of real solve driver functions for GT matrices - * @defgroup realPTsolve real - * @ingroup solvePT - * This is the group of real solve driver functions for PT matrices - * @defgroup realGEeigen real - * @ingroup eigenGE - * This is the group of real eigenvalue driver functions for GE matrices - * @defgroup realSYeigen real - * @ingroup eigenSY - * This is the group of real eigenvalue driver functions for SY matrices - * @defgroup realGEsing real - * @ingroup singGE - * This is the group of real singular value driver functions for GE matrices - * @defgroup realGEcomputational real - * @ingroup computationalGE - * This is the group of real computational functions for GE matrices - * @defgroup realPOcomputational real - * @ingroup computationalPO - * This is the group of real computational functions for PO matrices - * @defgroup realSYcomputational real - * @ingroup computationalSY - * This is the group of real computational functions for SY matrices - * @defgroup realGBcomputational real - * @ingroup computationalGB - * This is the group of real computational functions for GB matrices - * @defgroup realPTcomputational real - * @ingroup computationalPT - * This is the group of real computational functions for PT matrices - * @defgroup realGTcomputational real - * @ingroup computationalGT - * This is the group of real computational functions for GT matrices - * @defgroup realGEauxiliary real - * @ingroup auxiliaryGE - * This is the group of real auxiliary functions for GE matrices - * @defgroup realPOauxiliary real - * @ingroup auxiliaryPO - * This is the group of real auxiliary functions for PO matrices - * @defgroup realSYauxiliary real - * @ingroup auxiliarySY - * This is the group of real auxiliary functions for SY matrices - * @defgroup realGBauxiliary real - * @ingroup auxiliaryGB - * This is the group of real auxiliary functions for GB matrices - * @defgroup realGTauxiliary real - * @ingroup auxiliaryGT - * This is the group of real auxiliary functions for GT matrices - * @defgroup realPTauxiliary real - * @ingroup auxiliaryPT - * This is the group of real auxiliary functions for PT matrices - * @defgroup realOTHERauxiliary real - * @ingroup OTHERauxiliary - * This is the group of real other auxiliary routines - * @defgroup realOTHERcomputational 
real - * @ingroup OTHERcomputational - * This is the group of real other Computational routines - * @defgroup realOTHERsolve real Other Solve Routines - * @ingroup OTHERsolve - * This is the group of real Other Solve routines - * @defgroup realOTHEReigen real - * @ingroup OTHEReigen - * This is the group of real Other Eigenvalue routines - * - **** - * - * @defgroup complexGEsolve complex - * @ingroup solveGE - * This is the group of complex solve driver functions for GE matrices - * @defgroup complexPOsolve complex - * @ingroup solvePO - * This is the group of complex solve driver functions for PO matrices - * @defgroup complexSYsolve complex - * @ingroup solveSY - * This is the group of complex solve driver functions for SY matrices - * @defgroup complexHEsolve complex - * @ingroup solveHE - * This is the group of complex solve driver functions for HE matrices - * @defgroup complexGBsolve complex - * @ingroup solveGB - * This is the group of complex solve driver functions for GB matrices - * @defgroup complexGTsolve complex - * @ingroup solveGT - * This is the group of complex solve driver functions for GT matrices - * @defgroup complexPTsolve complex - * @ingroup solvePT - * This is the group of complex solve driver functions for PT matrices - * @defgroup complexGEeigen complex - * @ingroup eigenGE - * This is the group of complex eigenvalue driver functions for GE matrices - * @defgroup complexSYeigen complex - * @ingroup eigenSY - * This is the group of complex eigenvalue driver functions for SY matrices - * @defgroup complexHEeigen complex - * @ingroup eigenHE - * This is the group of complex eigenvalue driver functions for HE matrices - * @defgroup complexGEsing complex - * @ingroup singGE - * This is the group of complex singular value driver functions for GE matrices - * @defgroup complexGEcomputational complex - * @ingroup computationalGE - * This is the group of complex computational functions for GE matrices - * @defgroup complexPOcomputational complex - * @ingroup computationalPO - * This is the group of complex computational functions for PO matrices - * @defgroup complexSYcomputational complex - * @ingroup computationalSY - * This is the group of complex computational functions for SY matrices - * @defgroup complexHEcomputational complex - * @ingroup computationalHE - * This is the group of complex computational functions for HE matrices - * @defgroup complexGBcomputational complex - * @ingroup computationalGB - * This is the group of complex computational functions for GB matrices - * @defgroup complexGTcomputational complex - * @ingroup computationalGT - * This is the group of complex computational functions for GT matrices - * @defgroup complexPTcomputational complex - * @ingroup computationalPT - * This is the group of complex computational functions for PT matrices - * @defgroup complexGEauxiliary complex - * @ingroup auxiliaryGE - * This is the group of complex auxiliary functions for GE matrices - * @defgroup complexPOauxiliary complex - * @ingroup auxiliaryPO - * This is the group of complex auxiliary functions for PO matrices - * @defgroup complexSYauxiliary complex - * @ingroup auxiliarySY - * This is the group of complex auxiliary functions for SY matrices - * @defgroup complexHEauxiliary complex - * @ingroup auxiliaryHE - * This is the group of complex auxiliary functions for HE matrices - * @defgroup complexGBauxiliary complex - * @ingroup auxiliaryGB - * This is the group of complex auxiliary functions for GB matrices - * @defgroup complexOTHERauxiliary complex 
- * @ingroup OTHERauxiliary - * This is the group of complex other auxiliary routines - * @defgroup complexOTHERcomputational complex - * @ingroup OTHERcomputational - * This is the group of complex other Computational routines - * @defgroup complexOTHERsolve complex Other Solve Routines - * @ingroup OTHERsolve - * This is the group of complex Other Solve routines - * @defgroup complexOTHEReigen complex Other Eigenvalue routines - * @ingroup OTHEReigen - * This is the group of complex Other Eigenvalue routines - * - **** - * - * @defgroup complex16GEsolve complex16 - * @ingroup solveGE - * This is the group of complex16 solve driver functions for GE matrices - * @defgroup complex16POsolve complex16 - * @ingroup solvePO - * This is the group of complex16 solve driver functions for PO matrices - * @defgroup complex16SYsolve complex16 - * @ingroup solveSY - * This is the group of complex16 solve driver functions for SY matrices - * @defgroup complex16HEsolve complex16 - * @ingroup solveHE - * This is the group of complex16 solve driver functions for HE matrices - * @defgroup complex16GBsolve complex16 - * @ingroup solveGB - * This is the group of complex16 solve driver functions for GB matrices - * @defgroup complex16GTsolve complex16 - * @ingroup solveGT - * This is the group of complex16 solve driver functions for GT matrices - * @defgroup complex16PTsolve complex16 - * @ingroup solvePT - * This is the group of complex16 solve driver functions for PT matrices - * @defgroup complex16GEeigen complex16 - * @ingroup eigenGE - * This is the group of complex16 eigenvalue driver functions for GE matrices - * @defgroup complex16SYeigen complex16 - * @ingroup eigenSY - * This is the group of complex16 eigenvalue driver functions for SY matrices - * @defgroup complex16HEeigen complex16 - * @ingroup eigenHE - * This is the group of complex16 eigenvalue driver functions for HE matrices - * @defgroup complex16GEsing complex16 - * @ingroup singGE - * This is the group of complex16 singular value driver functions for GE matrices - * @defgroup complex16GEcomputational complex16 - * @ingroup computationalGE - * This is the group of complex16 computational functions for GE matrices - * @defgroup complex16POcomputational complex16 - * @ingroup computationalPO - * This is the group of complex16 computational functions for PO matrices - * @defgroup complex16SYcomputational complex16 - * @ingroup computationalSY - * This is the group of complex16 computational functions for SY matrices - * @defgroup complex16HEcomputational complex16 - * @ingroup computationalHE - * This is the group of complex16 computational functions for HE matrices - * @defgroup complex16GBcomputational complex16 - * @ingroup computationalGB - * This is the group of complex16 computational functions for GB matrices - * @defgroup complex16GTcomputational complex16 - * @ingroup computationalGT - * This is the group of complex16 computational functions for GT matrices - * @defgroup complex16PTcomputational complex16 - * @ingroup computationalPT - * This is the group of complex16 computational functions for PT matrices - * @defgroup complex16GEauxiliary complex16 - * @ingroup auxiliaryGE - * This is the group of complex16 auxiliary functions for GE matrices - * @defgroup complex16POauxiliary complex16 - * @ingroup auxiliaryPO - * This is the group of complex16 auxiliary functions for PO matrices - * @defgroup complex16SYauxiliary complex16 - * @ingroup auxiliarySY - * This is the group of complex16 auxiliary functions for SY matrices - * 
@defgroup complex16HEauxiliary complex16 - * @ingroup auxiliaryHE - * This is the group of complex16 auxiliary functions for HE matrices - * @defgroup complex16GBauxiliary complex16 - * @ingroup auxiliaryGB - * This is the group of complex16 auxiliary functions for GB matrices - * @defgroup complex16OTHERcomputational complex16 - * @ingroup OTHERcomputational - * This is the group of complex16 other Computational routines - * @defgroup complex16OTHERauxiliary complex16 - * @ingroup OTHERauxiliary - * This is the group of complex16 other auxiliary routines - * @defgroup auxOTHERcomputational auxiliary Computational routines - * @ingroup OTHERcomputational - * This is the group of auxiliary Computational routines - * @defgroup complex16OTHERsolve complex16 Other Solve Routines - * @ingroup OTHERsolve - * This is the group of complex16 Other Solve routines - * @defgroup complex16OTHEReigen complex16 Other Eigenvalue routines - * @ingroup OTHEReigen - * This is the group of complex16 Other Eigenvalue routines - * - **** - * - * @defgroup testing LAPACK Testing - * This is the group of LAPACK TESTING routines. - * - * @defgroup matgen Matrix Generation - * @ingroup testing - * This is the group of LAPACK TESTING MATGEN routines. - * - * @defgroup lin Linear Solve - * @ingroup testing - * This is the group of LAPACK TESTING LIN routines. - * - * @defgroup eig Eigenvalue and Singular value - * @ingroup testing - * This is the group of LAPACK TESTING EIG routines. - * - * @defgroup real_matgen real - * @ingroup matgen - * This is the group of real LAPACK TESTING MATGEN routines. - * - * @defgroup double_matgen double - * @ingroup matgen - * This is the group of double LAPACK TESTING MATGEN routines. - * - * @defgroup complex_matgen complex - * @ingroup matgen - * This is the group of complex LAPACK TESTING MATGEN routines. - * - * @defgroup complex16_matgen complex16 - * @ingroup matgen - * This is the group of complex16 LAPACK TESTING MATGEN routines. - * - * @defgroup aux_matgen aux - * @ingroup matgen - * This is the group of auxiliary LAPACK TESTING MATGEN routines. - * - * @defgroup single_lin real - * @ingroup lin - * This is the group of real LAPACK TESTING LIN routines. - * - * @defgroup double_lin double - * @ingroup lin - * This is the group of double LAPACK TESTING LIN routines. - * - * @defgroup complex_lin complex - * @ingroup lin - * This is the group of complex LAPACK TESTING LIN routines. - * - * @defgroup complex16_lin complex16 - * @ingroup lin - * This is the group of complex16 LAPACK TESTING LIN routines. - * - * @defgroup aux_lin aux - * @ingroup lin - * This is the group of auxiliary LAPACK TESTING LIN routines. - * - * @defgroup single_eig real - * @ingroup eig - * This is the group of real LAPACK TESTING EIG routines. - * - * @defgroup double_eig double - * @ingroup eig - * This is the group of double LAPACK TESTING EIG routines. - * - * @defgroup complex_eig complex - * @ingroup eig - * This is the group of complex LAPACK TESTING EIG routines. - * - * @defgroup complex16_eig complex16 - * @ingroup eig - * This is the group of complex16 LAPACK TESTING EIG routines. - * - * @defgroup aux_eig aux - * @ingroup eig - * This is the group of auxiliary LAPACK TESTING EIG routines. - * - **** - * @defgroup blas Reference BLAS - * This is the group of BLAS routines. - * - * @defgroup level1 Level1 - * @ingroup blas - * This is the group of LEVEL 1 BLAS routines. - * @defgroup level2 Level2 - * @ingroup blas - * This is the group of LEVEL 2 BLAS routines. 
- * @defgroup level3 Level3 - * @ingroup blas - * This is the group of LEVEL 3 BLAS routines. - * @defgroup aux_blas Auxiliary BLAS - * @ingroup blas - * This is the group of Auxiliary 3 BLAS routines. -* @defgroup blastesting Testing - * @ingroup blas - * This is the group of BLAS TESTING routines. - * - * @defgroup single_blas_level1 real - * @ingroup level1 - * This is the group of real LEVEL 1 BLAS routines. - * @defgroup double_blas_level1 double - * @ingroup level1 - * This is the group of double LEVEL 1 BLAS routines. - * @defgroup complex_blas_level1 complex - * @ingroup level1 - * This is the group of complex LEVEL 1 BLAS routines. - * @defgroup complex16_blas_level1 complex16 - * @ingroup level1 - * This is the group of complex16 LEVEL 1 BLAS routines. - * - * @defgroup single_blas_level2 real - * @ingroup level2 - * This is the group of real LEVEL 2 BLAS routines. - * @defgroup double_blas_level2 double - * @ingroup level2 - * This is the group of double LEVEL 2 BLAS routines. - * @defgroup complex_blas_level2 complex - * @ingroup level2 - * This is the group of complex LEVEL 2 BLAS routines. - * @defgroup complex16_blas_level2 complex16 - * @ingroup level2 - * This is the group of complex16 LEVEL 2 BLAS routines. - * - * @defgroup single_blas_level3 real - * @ingroup level3 - * This is the group of real LEVEL 3 BLAS routines. - * @defgroup double_blas_level3 double - * @ingroup level3 - * This is the group of double LEVEL 3 BLAS routines. - * @defgroup complex_blas_level3 complex - * @ingroup level3 - * This is the group of complex LEVEL 3 BLAS routines. - * @defgroup complex16_blas_level3 complex16 - * @ingroup level3 - * This is the group of complex16 LEVEL 3 BLAS routines. - * - * @defgroup single_blas_testing real - * @ingroup blastesting - * This is the group of real BLAS TESTING routines. - * @defgroup double_blas_testing double - * @ingroup blastesting - * This is the group of double BLAS TESTING routines. - * @defgroup complex_blas_testing complex - * @ingroup blastesting - * This is the group of complex BLAS TESTING routines. - * @defgroup complex16_blas_testing complex16 - * @ingroup blastesting - * This is the group of complex16 BLAS TESTING routines. - * + +@defgroup lapack_top LAPACK +@{ + @defgroup solve_top Linear solve, AX = B + @{ + @defgroup gesv_driver_grp LU: General matrix, driver + @{ + @defgroup gesv_driver --- full --- + @defgroup gesv gesv: factor and solve + @defgroup gesvx gesvx: factor and solve, expert + @defgroup gesvxx gesvxx: factor and solve, extra precise + @defgroup gesv_mixed gesv: factor and solve, mixed precision + + @defgroup gbsv_driver --- banded --- + @defgroup gbsv gbsv: factor and solve + @defgroup gbsvx gbsvx: factor and solve, expert + @defgroup gbsvxx gbsvxx: factor and solve, extra precise + + @defgroup gtsv_driver --- tridiagonal --- + @defgroup gtsv gtsv: factor and solve + @defgroup gtsvx gtsvx: factor and solve, expert + @} + + @defgroup gesv_comp_grp LU: computational routines (factor, cond, etc.) + @{ + @defgroup gesv_comp --- full --- + @defgroup gecon gecon: condition number estimate + @defgroup getrf getrf: triangular factor + @defgroup getrf2 getrf2: triangular factor panel, recursive? 
+ @defgroup getf2 getf2: triangular factor panel, level 2 + @defgroup getrs getrs: triangular solve using factor + @defgroup getri getri: triangular inverse + @defgroup gerfs gerfs: iterative refinement + @defgroup gerfsx gerfsx: iterative refinement, expert + @defgroup geequ geequ: equilibration + @defgroup geequb geequb: equilibration, power of 2 + @defgroup laqge laqge: row/col scale matrix + @defgroup laswp laswp: swap permutation + + @defgroup getc2 getc2: triangular factor, with complete pivoting + @defgroup gesc2 gesc2: triangular solve using factor, with complete pivoting + @defgroup latdf latdf: Dif-estimate with complete pivoting LU, step in tgsen + + @defgroup la_gercond la_gercond: Skeel condition number estimate + @defgroup la_gerpvgrw la_gerpvgrw: reciprocal pivot growth + @defgroup la_gerfsx_extended la_gerfsx_extended: step in gerfsx + + @defgroup gbsv_comp --- banded --- + @defgroup gbcon gbcon: condition number estimate + @defgroup gbtrf gbtrf: triangular factor + @defgroup gbtf2 gbtf2: triangular factor, level 2 + @defgroup gbtrs gbtrs: triangular solve using factor + @defgroup gbrfs gbrfs: iterative refinement + @defgroup gbrfsx gbrfsx: iterative refinement, expert + @defgroup gbequ gbequ: equilibration + @defgroup gbequb gbequb: equilibration, power of 2 + @defgroup laqgb laqgb: row/col scale matrix + @defgroup la_gbrcond la_gbrcond: Skeel condition number estimate + @defgroup la_gbrpvgrw la_gbrpvgrw: reciprocal pivot growth + @defgroup la_gbrfsx_extended la_gbrfsx_extended: step in gbrfsx + + @defgroup gtsv_comp --- tridiagonal --- + @defgroup gtcon gtcon: condition number estimate + @defgroup gttrf gttrf: triangular factor + @defgroup gttrs gttrs: triangular solve using factor + @defgroup gtts2 gtts2: triangular solve using factor + @defgroup gtrfs gtrfs: iterative refinement + @} + + @defgroup posv_driver_grp Cholesky: Hermitian/symmetric positive definite matrix, driver + @{ + @defgroup posv_driver --- full --- + @defgroup posv posv: factor and solve + @defgroup posvx posvx: factor and solve, expert + @defgroup posvxx posvxx: factor and solve, extra precise + @defgroup posv_mixed posv: factor and solve, mixed precision + + @defgroup ppsv_driver --- packed --- + @defgroup ppsv ppsv: factor and solve + @defgroup ppsvx ppsvx: factor and solve, expert + + @defgroup pfsv_driver --- rectangular full packed (RFP) --- + @defgroup pfsv pfsv: factor and solve [not available] + + @defgroup pbsv_driver --- banded --- + @defgroup pbsv pbsv: factor and solve + @defgroup pbsvx pbsvx: factor and solve, expert + + @defgroup ptsv_driver --- tridiagonal --- + @defgroup ptsv ptsv: factor and solve + @defgroup ptsvx ptsvx: factor and solve, expert + @} + + @defgroup posv_comp_grp Cholesky: computational routines (factor, cond, etc.) + @{ + @defgroup posv_comp --- full --- + @defgroup pocon pocon: condition number estimate + @defgroup potrf potrf: triangular factor + @defgroup potrf2 potrf2: triangular factor panel, recursive? 
+ @defgroup potf2 potf2: triangular factor panel, level 2 + @defgroup pstrf pstrf: triangular factor, with pivoting + @defgroup pstf2 pstf2: triangular factor, with pivoting panel, level 2 + @defgroup potrs potrs: triangular solve using factor + @defgroup potri potri: triangular inverse + @defgroup porfs porfs: iterative refinement + @defgroup porfsx porfsx: iterative refinement, expert + @defgroup poequ poequ: equilibration + @defgroup poequb poequb: equilibration, power of 2 + @defgroup laqhe laqhe: row/col scale matrix + @defgroup la_porcond la_porcond: Skeel condition number estimate + @defgroup la_porpvgrw la_porpvgrw: reciprocal pivot growth + @defgroup la_porfsx_extended la_porfsx_extended: step in porfsx + + @defgroup ppsv_comp --- packed --- + @defgroup ppcon ppcon: condition number estimate + @defgroup pptrf pptrf: triangular factor + @defgroup pptrs pptrs: triangular solve using factor + @defgroup pptri pptri: triangular inverse + @defgroup pprfs pprfs: iterative refinement + @defgroup ppequ ppequ: equilibration + @defgroup laqhp laqhp: row/col scale matrix + + @defgroup pfsv_comp --- rectangular full packed (RFP) --- + @defgroup pftrf pftrf: triangular factor + @defgroup pftrs pftrs: triangular solve using factor + @defgroup pftri pftri: triangular inverse + + @defgroup pbsv_comp --- banded --- + @defgroup pbcon pbcon: condition number estimate + @defgroup pbtrf pbtrf: triangular factor + @defgroup pbtf2 pbtf2: triangular factor panel, level 2 + @defgroup pbtrs pbtrs: triangular solve using factor + @defgroup pbrfs pbrfs: iterative refinement + @defgroup pbequ pbequ: equilibration + @defgroup laqhb laqhb: row/col scale matrix + + @defgroup ptsv_comp --- tridiagonal --- + @defgroup ptcon ptcon: condition number estimate + @defgroup pttrf pttrf: triangular factor + @defgroup pttrs pttrs: triangular solve using factor + @defgroup ptts2 ptts2: triangular solve using factor, unblocked + @defgroup ptrfs ptrfs: iterative refinement + @} + + @defgroup hesv_driver_grp LDL: Hermitian/symmetric indefinite matrix, driver + @{ + @defgroup hesv_driver --- full, rook pivoting --- + @defgroup hesv {he,sy}sv: rook (v1) + @defgroup hesv_rook {he,sy}sv_rook: rook (v2) + @defgroup hesv_rk {he,sy}sv_rk: rook (v3) + @defgroup hesvx {he,sy}svx: rook (v1, expert) + @defgroup hesvxx {he,sy}svxx: rook (v1, expert) + + @defgroup hpsv_driver --- packed, rook pivoting --- + @defgroup hpsv {hp,sp}sv: factor and solve + @defgroup hpsvx {hp,sp}svx: factor and solve, expert + + @defgroup hesv_aa_driver --- full, Aasen --- + @defgroup hesv_aa {he,sy}sv_aa: Aasen + @defgroup hesv_aa_2stage {he,sy}sv_aa_2stage: Aasen, blocked 2-stage + @} + + @defgroup hesv_comp_grp LDL: computational routines (factor, cond, etc.) 
+ @{ + @defgroup hesv_comp_v1 --- full, rook v1 --- + @defgroup hecon {he,sy}con: condition number estimate + @defgroup hetrf {he,sy}trf: triangular factor + @defgroup lahef la{he,sy}f: step in hetrf + @defgroup hetf2 {he,sy}tf2: triangular factor, level 2 + @defgroup hetrs {he,sy}trs: triangular solve using factor + @defgroup hetri {he,sy}tri: triangular inverse + @defgroup herfs {he,sy}rfs: iterative refinement + @defgroup herfsx {he,sy}rfsx: iterative refinement, expert + @defgroup heequb {he,sy}equb: equilibration, power of 2 + @defgroup syconv syconv: convert to/from L and D from hetrf + + @defgroup hecon_3 {he,sy}con_3: condition number estimate + @defgroup hetri2 {he,sy}tri2: inverse + @defgroup hetri2x {he,sy}tri2x: inverse + @defgroup hetri_3 {he,sy}tri_3: inverse + @defgroup hetri_3x {he,sy}tri_3x: inverse + @defgroup hetrs2 {he,sy}trs2: solve using factor + @defgroup hetrs_3 {he,sy}trs_3: solve using factor + + @defgroup heswapr {he,sy}swapr: apply 2-sided permutation + @defgroup la_hercond la_hercond: Skeel condition number estimate + @defgroup la_herfsx_extended la_herfsx_extended: step in herfsx + @defgroup la_herpvgrw la_herpvgrw: reciprocal pivot growth + + @defgroup hpsv_comp --- packed, rook v1 --- + @defgroup hpcon {hp,sp}con: condition number estimate + @defgroup hptrf {hp,sp}trf: triangular factor + @defgroup hptrs {hp,sp}trs: triangular solve using factor + @defgroup hptri {hp,sp}tri: triangular inverse + @defgroup hprfs {hp,sp}rfs: iterative refinement + + @defgroup hesv_comp_v2 --- full, rook v2 --- + @defgroup hecon_rook {he,sy}con_rook: condition number estimate + @defgroup hetrf_rook {he,sy}trf_rook: triangular factor + @defgroup lahef_rook la{he,sy}f_rook: triangular factor step + @defgroup hetf2_rook {he,sy}tf2_rook: triangular factor, level 2 + @defgroup hetrs_rook {he,sy}trs_rook: triangular solve using factor + @defgroup hetri_rook {he,sy}tri_rook: triangular inverse + + @defgroup hesv_comp_v3 --- full, rook v3 --- + @defgroup hetrf_rk {he,sy}trf_rk: triangular factor + @defgroup lahef_rk la{he,sy}f_rk: triangular factor step + @defgroup hetf2_rk {he,sy}tf2_rk: triangular factor, level 2 + @defgroup syconvf syconvf: convert to/from hetrf to hetrf_rk format + @defgroup syconvf_rook syconvf_rook: convert to/from hetrf_rook to hetrf_rk format + + @defgroup hesv_comp_aasen --- full, Aasen --- + @defgroup hetrf_aa {he,sy}trf_aa: triangular factor + @defgroup lahef_aa la{he,sy}f_aa: triangular factor partial factor + @defgroup hetrs_aa {he,sy}trs_aa: triangular solve using factor + + @defgroup hesv_comp_aasen2 --- full, Aasen, blocked 2-stage --- + @defgroup hetrf_aa_2stage {he,sy}trf_aa_2stage: triangular factor + @defgroup hetrs_aa_2stage {he,sy}trs_aa_2stage: triangular solve using factor + @} + + @defgroup trsv_comp_grp Triangular computational routines (solve, cond, etc.) 
+ @{ + @defgroup trsv_comp --- full --- + @defgroup trcon trcon: condition number estimate + @defgroup trtrs trtrs: triangular solve + @defgroup latrs latrs: triangular solve with robust scaling + @defgroup latrs3 latrs3: triangular solve with robust scaling, level 3 + @defgroup trtri trtri: triangular inverse + @defgroup trti2 trti2: triangular inverse, level 2 + @defgroup trrfs trrfs: triangular iterative refinement + @defgroup lauum lauum: triangular multiply: U^H U + @defgroup lauu2 lauu2: triangular multiply: U^H U, level 2 + + @defgroup tpsv_comp --- packed --- + @defgroup tpcon tpcon: condition number estimate + @defgroup tptrs tptrs: triangular solve + @defgroup latps latps: triangular solve with robust scaling + @defgroup tptri tptri: triangular inverse + @defgroup tprfs tprfs: triangular iterative refinement + + @defgroup tfsv_comp --- rectangular full packed (RFP) --- + @defgroup tftri tftri: triangular inverse, RFP + + @defgroup tbsv_comp --- banded --- + @defgroup tbcon tbcon: condition number estimate + @defgroup tbtrs tbtrs: triangular solve + @defgroup latbs latbs: triangular solve with scaling + @defgroup tbrfs tbrfs: triangular iterative refinement + @} + + @defgroup solve_aux_grp Auxiliary routines + @{ + @defgroup lacn2 lacn2: 1-norm estimate, e.g., || A^{-1} ||_1 in gecon + @defgroup lacon lacon: 1-norm estimate, e.g., || A^{-1} ||_1 in gecon, old + @defgroup la_lin_berr la_lin_berr: backward error + @} + @} + + @defgroup gels_top Least squares + @{ + @defgroup gels_driver_grp Standard least squares, min || Ax - b ||_2 + @{ + @defgroup gels gels: least squares using QR/LQ + @defgroup gelst gelst: least squares using QR/LQ with T matrix + @defgroup gelss gelss: least squares using SVD, QR iteration + @defgroup gelsd gelsd: least squares using SVD, divide and conquer + @defgroup gelsy gelsy: least squares using complete orthogonal factor + @defgroup getsls getsls: least squares using tall-skinny QR/LQ + @} + + @defgroup ggls_driver_grp Constrained least squares + @{ + @defgroup gglse gglse: equality-constrained least squares + @defgroup ggglm ggglm: Gauss-Markov linear model + @} + + @defgroup gels_aux_grp Auxiliary routines + @{ + @defgroup laic1 laic1: condition estimate, step in gelsy + @defgroup lals0 lals0: back multiplying factors, step in gelsd + @defgroup lalsa lalsa: SVD of coefficient matrix, step in gelsd + @defgroup lalsd lalsd: uses SVD for least squares, step in gelsd + @} + @} + + @defgroup unitary_top Orthogonal/unitary factors (QR, CS, etc.) 
+ @{ + @defgroup geqr_comp_grp QR + @{ + @defgroup geqr_comp1 --- flexible --- + @defgroup geqr geqr: QR factor, flexible + @defgroup gemqr gemqr: multiply by Q from geqr + + @defgroup geqr_comp2 --- classic --- + @defgroup geqrf geqrf: QR factor + @defgroup geqr2 geqr2: QR factor, level 2 + @defgroup ungqr {un,or}gqr: generate explicit Q from geqrf + @defgroup ung2r {un,or}g2r: generate explicit Q from geqrf, level 2 + @defgroup unmqr {un,or}mqr: multiply by Q from geqrf + @defgroup unm2r {un,or}m2r: multiply by Q from geqrf, level 2 + + @defgroup geqr_comp3 --- with T --- + @defgroup geqrt geqrt: QR factor, with T + @defgroup geqrt2 geqrt2: QR factor, with T, level 2 + @defgroup geqrt3 geqrt3: QR factor, with T, recursive panel + @defgroup gemqrt gemqrt: multiply by Q from geqrt + + @defgroup geqr_comp4 --- positive --- + @defgroup geqrfp geqrfp: QR factor, diag( R ) ≥ 0 + @defgroup geqr2p geqr2p: QR factor, diag( R ) ≥ 0, level 2 + @} + + @defgroup geqpf_comp_grp QR with pivoting + @{ + @defgroup geqp3 geqp3: QR factor with pivoting, level 3 + @defgroup laqp2 laqp2: step of geqp3 + @defgroup laqps laqps: step of geqp3 + @} + + @defgroup getsqr_comp_grp QR, tall-skinny + @{ + @defgroup latsqr latsqr: tall-skinny QR factor + @defgroup ungtsqr {un,or}gtsqr: generate Q from latsqr + @defgroup ungtsqr_row {un,or}gtsqr_row: generate Q from latsqr + @defgroup larfb_gett larfb_gett: step in ungtsqr_row + @defgroup lamtsqr lamtsqr: multiply by Q from latsqr + @defgroup getsqrhrt getsqrhrt: tall-skinny QR factor, with Householder reconstruction + @defgroup unhr_col {un,or}hr_col: Householder reconstruction + @defgroup launhr_col_getrfnp la{un,or}hr_col_getrfnp: LU factor without pivoting + @defgroup launhr_col_getrfnp2 la{un,or}hr_col_getrfnp2: LU factor without pivoting, level 2 + @} + + @defgroup tpqr_comp_grp QR, triangular-pentagonal + @{ + @defgroup tpqrt tpqrt: QR factor + @defgroup tpqrt2 tpqrt2: QR factor, level 2 + @defgroup tpmqrt tpmqrt: applies Q + @defgroup tprfb tprfb: applies Q (like larfb) + @} + + @defgroup ggqr_comp_grp Generalized QR + @{ + @defgroup ggqrf ggqrf: Generalized QR factor + @} + + @defgroup gelq_comp_grp LQ + @{ + @defgroup gelq_comp1 --- flexible --- + @defgroup gelq gelq: LQ factor, flexible + @defgroup gemlq gemlq: multiply by Q from gelq + + @defgroup gelq_comp2 --- classic --- + @defgroup gelqf gelqf: LQ factor + @defgroup gelq2 gelq2: LQ factor, level 2 + @defgroup unglq {un,or}glq: generate explicit Q from gelqf + @defgroup ungl2 {un,or}gl2: generate explicit Q, level 2, step in unglq + @defgroup unmlq {un,or}mlq: multiply by Q from gelqf + @defgroup unml2 {un,or}ml2: multiply by Q, level 2, step in unmlq + + @defgroup gelq_comp3 --- with T --- + @defgroup gelqt gelqt: LQ factor, with T + @defgroup gelqt3 gelqt3: LQ factor, with T, recursive + @defgroup gemlqt gemlqt: multiply by Q from gelqt + @} + + @defgroup geswlq_comp_grp LQ, short-wide + @{ + @defgroup laswlq laswlq: short-wide LQ factor + @defgroup lamswlq lamswlq: multiply by Q from laswlq + @} + + @defgroup tplq_comp_grp LQ, triangular-pentagonal + @{ + @defgroup tplqt tplqt: QR factor + @defgroup tplqt2 tplqt2: QR factor, level 2 + @defgroup tpmlqt tpmlqt: applies Q + @} + + @defgroup geql_comp_grp QL + @{ + @defgroup geqlf geqlf: QL factor + @defgroup geql2 geql2: QL factor, level 2 + @defgroup ungql {un,or}gql: generate explicit Q from geqlf + @defgroup unmql {un,or}mql: multiply by Q from geqlf + @defgroup ung2l {un,or}g2l: step in ungql + @defgroup unm2l {un,or}m2l: step in unmql + @} + + 
@defgroup gerq_comp_grp RQ + @{ + @defgroup gerqf gerqf: RQ factor + @defgroup gerq2 gerq2: RQ factor, level 2 + @defgroup ungrq {un,or}grq: generate explicit Q from gerqf + @defgroup unmrq {un,or}mrq: multiply by Q from gerqf + @defgroup unmr2 {un,or}mr2: step in unmrq + @defgroup ungr2 {un,or}gr2: step in ungrq + @} + + @defgroup ggrq_comp_grp Generalized RQ + @{ + @defgroup ggrqf ggrqf: Generalized RQ factor + @} + + @defgroup gerz_comp_grp RZ + @{ + @defgroup tzrzf tzrzf: RZ factor + @defgroup latrz latrz: RZ factor step + @defgroup unmrz {un,or}mrz: multiply by Z from tzrzf + @defgroup unmr3 {un,or}mr3: step in unmrz + @defgroup larz larz: apply reflector + @defgroup larzb larzb: apply block reflector + @defgroup larzt larzt: generate T matrix + @} + + @defgroup gecs_comp_grp Cosine-Sine (CS) decomposition + @{ + @defgroup bbcsd bbcsd: ?? + @defgroup uncsd {un,or}csd: ?? + @defgroup uncsd2by1 {un,or}csd2by1: ?? + @defgroup unbdb {un,or}bdb: bidiagonalize partitioned unitary matrix, step in uncsd + @defgroup unbdb1 {un,or}bdb1: step in uncsd2by1 + @defgroup unbdb2 {un,or}bdb2: step in uncsd2by1 + @defgroup unbdb3 {un,or}bdb3: step in uncsd2by1 + @defgroup unbdb4 {un,or}bdb4: step in uncsd2by1 + @defgroup unbdb5 {un,or}bdb5: step in uncsd2by1 + @defgroup unbdb6 {un,or}bdb6: step in uncsd2by1 + + @defgroup lapmr lapmr: permute rows + @defgroup lapmt lapmt: permute cols + @} + + @defgroup reflector_aux_grp Householder reflectors + @{ + @defgroup larf larf: apply Householder reflector + @defgroup larfx larfx: apply Householder reflector, unrolled + @defgroup larfy larfy: apply Householder reflector symmetrically (2-sided) + @defgroup larfb larfb: apply block Householder reflector + @defgroup larfg larfg: generate Householder reflector + @defgroup larfgp larfgp: generate Householder reflector, beta ≥ 0 + @defgroup larft larft: generate T matrix + @} + + @defgroup rot_aux_grp Givens/Jacobi plane rotations + @{ + @defgroup lartg lartg: generate plane rotation, more accurate than BLAS rot + @defgroup lartgp lartgp: generate plane rotation, more accurate than BLAS rot + @defgroup lasr lasr: apply series of plane rotations + @defgroup largv largv: generate vector of plane rotations + @defgroup lartv lartv: apply vector of plane rotations to vectors + @defgroup lar2v lar2v: apply vector of plane rotations to 2x2 matrices + @defgroup lacrt lacrt: apply plane rotation (unused?) 
+ @} + @} + + @defgroup geev_top Non-symmetric eigenvalues + @{ + @defgroup geev_driver_grp Standard eig driver, AV = VΛ + @{ + @defgroup geev geev: eig + @defgroup geevx geevx: eig, expert + + @defgroup gees gees: Schur form + @defgroup geesx geesx: Schur form, expert + @} + + @defgroup ggev_driver_grp Generalized eig driver + @{ + @defgroup ggev3 ggev3: eig + @defgroup ggev ggev: eig, unblocked + @defgroup ggevx ggevx: eig, expert + + @defgroup gges3 gges3: Schur form + @defgroup gges gges: Schur form, unblocked + @defgroup ggesx ggesx: Schur form, expert + @} + + @defgroup gedmd DMD driver, Dynamic Mode Decomposition + + @defgroup geev_comp_grp Eig computational routines + @{ + @defgroup gebal gebal: balance matrix + @defgroup gehrd gehrd: reduction to Hessenberg + @defgroup gehd2 gehd2: reduction to Hessenberg, level 2 + @defgroup lahr2 lahr2: step in gehrd + @defgroup unghr {un,or}ghr: generate Q from gehrd + @defgroup unmhr {un,or}mhr: multiply by Q from gehrd + @defgroup gebak gebak: back-transform eigvec + @defgroup hseqr hseqr: Hessenberg eig, QR iteration + @defgroup hsein hsein: Hessenberg inverse iteration for eigvec + @defgroup trevc trevc: eigenvectors of triangular Schur form, old + @defgroup trevc3 trevc3: eigenvectors of triangular Schur form, blocked + @defgroup laln2 laln2: 1x1 or 2x2 solve, step in trevc + + @defgroup trsyl trsyl: Sylvester equation + @defgroup trsyl3 trsyl3: Sylvester equation, level 3 + @defgroup lasy2 lasy2: Sylvester equation + + @defgroup trsna trsna: eig condition numbers + @defgroup laqtr laqtr: quasi-triangular solve + + @defgroup trexc trexc: reorder Schur form + @defgroup trsen trsen: reorder Schur form + @defgroup laexc laexc: reorder Schur form + + @defgroup lanv2 lanv2: 2x2 Schur factor + + @defgroup laqr_group --- hseqr auxiliary --- + @defgroup laein laein: eigvec by Hessenberg inverse iteration + @defgroup lahqr lahqr: eig of Hessenberg, step in hseqr + @defgroup laqr0 laqr0: eig of Hessenberg, step in hseqr + @defgroup laqr1 laqr1: step in hseqr + @defgroup laqr2 laqr2: step in hseqr + @defgroup laqr3 laqr3: step in hseqr + @defgroup laqr4 laqr4: eig of Hessenberg, step in hseqr + @defgroup laqr5 laqr5: step in hseqr + + @defgroup iparmq iparmq: set parameters for hseqr + + @defgroup laqz_group --- ggev3, gges3 auxiliary --- + @defgroup laqz0 laqz0: step in ggev3, gges3 + @defgroup laqz1 laqz1: step in ggev3, gges3 + @defgroup laqz2 laqz2: step in ggev3, gges3 + @defgroup laqz3 laqz3: step in ggev3, gges3 + @defgroup laqz4 laqz4: step in ggev3, gges3 + @} + + @defgroup ggev_comp_grp Generalized eig computational routines + @{ + @defgroup ggbal ggbal: balance matrix + @defgroup gghrd gghrd: reduction to Hessenberg + @defgroup gghd3 gghd3: reduction to Hessenberg, level 3 + @defgroup hgeqz hgeqz: generalized Hessenberg eig + @defgroup ggbak ggbak: back-transform eigvec + @defgroup tgsen tgsen: reorder generalized Schur form + @defgroup tgsna tgsna: reciprocal cond est + @defgroup tgsyl tgsyl: Sylvester equation + @defgroup tgsy2 tgsy2: Sylvester equation panel (?) 
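Each leaf @defgroup in this hierarchy only declares the bucket; a routine's documentation lands in it through an \ingroup reference in the routine's own source header. A minimal sketch in the fixed-form Fortran comment style used by the LAPACK sources; the \brief wording is illustrative, not quoted from LAPACK:

    *> \brief DGESV computes the solution to a real system of linear
    *>        equations A * X = B.
    *>
    *> \ingroup gesv

The \ingroup gesv line attaches DGESV to the "gesv: factor and solve" group defined near the top of this file, taking the place of the old per-precision tag (\ingroup doubleGEsolve) that matched the groups removed above.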
+ @defgroup unm22 {un,or}m22: multiply by banded Q, step in gghd3 + @defgroup lagv2 lagv2: 2x2 generalized Schur factor + @defgroup tgevc tgevc: eigvec of pair of matrices + @defgroup tgexc tgexc: reorder generalized Schur form + @defgroup tgex2 tgex2: reorder generalized Schur form + @} + @} + + @defgroup heev_top Hermitian/symmetric eigenvalues + @{ + @defgroup heev_driver_grp Standard eig driver, AV = VΛ + @{ + @defgroup heev_driver --- full --- + @defgroup heev {he,sy}ev: eig, QR iteration + @defgroup heevd {he,sy}evd: eig, divide and conquer + @defgroup heevr {he,sy}evr: eig, MRRR + @defgroup heevx {he,sy}evx: eig, bisection + + @defgroup heev_driver2 --- full, 2-stage --- + @defgroup heev_2stage {he,sy}ev_2stage: eig, QR iteration + @defgroup heevd_2stage {he,sy}evd_2stage: eig, divide and conquer + @defgroup heevr_2stage {he,sy}evr_2stage: eig, MRRR + @defgroup heevx_2stage {he,sy}evx_2stage: eig, bisection + + @defgroup hpev_driver --- packed --- + @defgroup hpev {hp,sp}ev: eig, QR iteration + @defgroup hpevd {hp,sp}evd: eig, divide and conquer + @defgroup hpevx {hp,sp}evx: eig, bisection + + @defgroup hbev_driver --- banded --- + @defgroup hbev {hb,sb}ev: eig, QR iteration + @defgroup hbevd {hb,sb}evd: eig, divide and conquer + @defgroup hbevx {hb,sb}evx: eig, bisection + + @defgroup hbev_driver2 --- banded, 2nd-stage --- + @defgroup hbev_2stage {hb,sb}ev_2stage: eig, QR iteration + @defgroup hbevd_2stage {hb,sb}evd_2stage: eig, divide and conquer + @defgroup hbevx_2stage {hb,sb}evx_2stage: eig, bisection + + @defgroup stev_driver --- tridiagonal --- + @defgroup stev stev: eig, QR iteration + @defgroup stevd stevd: eig, divide and conquer + @defgroup stevr stevr: eig, MRRR + @defgroup stevx stevx: eig, bisection + @defgroup pteqr pteqr: eig, positive definite tridiagonal + + @defgroup stebz stebz: eig, Kahan + @defgroup sterf sterf: eig, QR iteration + @defgroup stedc stedc: eig, divide and conquer + @defgroup stegr stegr: eig, bisection, see stemr + @defgroup stein stein: eig, inverse iteration + @defgroup stemr stemr: eig, relatively robust representation (RRR) + @defgroup steqr steqr: eig, QR iteration + @} + + @defgroup hegv_driver_grp Generalized eig driver, AV = BVΛ, etc. 
+ @{ + @defgroup hegv_driver --- full --- + @defgroup hegv {he,sy}gv: eig, QR iteration + @defgroup hegv_2stage {he,sy}gv_2stage: eig, QR iteration, 2-stage + @defgroup hegvd {he,sy}gvd: eig, divide and conquer + @defgroup hegvx {he,sy}gvx: eig, bisection + + @defgroup hpgv_driver --- packed --- + @defgroup hpgv {hp,sp}gv: eig, QR iteration + @defgroup hpgvd {hp,sp}gvd: eig, divide and conquer + @defgroup hpgvx {hp,sp}gvx: eig, bisection + + @defgroup hbgv_driver --- banded --- + @defgroup hbgv {hb,sb}gv: eig, QR iteration + @defgroup hbgvd {hb,sb}gvd: eig, divide and conquer + @defgroup hbgvx {hb,sb}gvx: eig, bisection + @} + + @defgroup heev_comp_grp Eig computational routines + @{ + @defgroup heev_comp --- full --- + @defgroup disna disna: eig condition numbers + @defgroup hetrd {he,sy}trd: reduction to tridiagonal + @defgroup hetd2 {he,sy}td2: reduction to tridiagonal, level 2 + @defgroup latrd latrd: step in hetrd + @defgroup ungtr {un,or}gtr: generate Q from hetrd + @defgroup unmtr {un,or}mtr: multiply by Q from hetrd + + @defgroup hetrd_2stage {he,sy}trd_2stage: reduction to tridiagonal, 2-stage + @defgroup hetrd_he2hb {he,sy}trd_he2hb: full to band (1st stage) + @defgroup hetrd_hb2st {he,sy}trd_hb2st: band to tridiagonal (2nd stage) + @defgroup hb2st_kernels {hb,sb}2st_kernels: band to tridiagonal (2nd stage) + + @defgroup lae2 lae2: 2x2 eig, step in steqr, stemr + @defgroup laesy laesy: 2x2 eig + @defgroup laev2 laev2: 2x2 eig + @defgroup lagtf lagtf: LU factor of (T - λI) + @defgroup lagts lagts: LU solve of (T - λI) x = y + + @defgroup hpev_comp --- packed --- + @defgroup hptrd {hp,sp}trd: reduction to tridiagonal + @defgroup upgtr {up,op}gtr: generate Q from hetrd + @defgroup upmtr {up,op}mtr: multiply by Q from hptrd + + @defgroup hbev_comp --- banded --- + @defgroup hbtrd {hb,sb}trd: reduction to tridiagonal + @} + + @defgroup hegv_comp_grp Generalized eig computational routines + @{ + @defgroup hegst {he,sy}gst: reduction to standard form + @defgroup hegs2 {he,sy}gs2: reduction to standard form, level 2 + @defgroup hpgst {hp,sp}gst: reduction to standard form, packed + @defgroup hbgst {hb,sb}gst: reduction to standard form, banded + @defgroup pbstf pbstf: split Cholesky factor, use with hbgst + @defgroup lag2 lag2: 2x2 eig + @} + + @defgroup stev_comp_grp tridiag bisection routines + @{ + @defgroup laebz laebz: counts eigvals <= value + @defgroup laneg laneg: Sturm count + @} + + @defgroup laed_comp_grp tridiag divide and conquer (D&C) routines + @{ + @defgroup laed0 laed0: D&C step: top level solver + @defgroup laed1 laed1: D&C step: merge subproblems + @defgroup laed2 laed2: D&C step: deflation + @defgroup laed3 laed3: D&C step: secular equation + @defgroup laed4 laed4: D&C step: secular equation nonlinear solver + @defgroup laed5 laed5: D&C step: secular equation, 2x2 + @defgroup laed6 laed6: D&C step: secular equation Newton step + @defgroup lamrg lamrg: permutation to merge 2 sorted lists + + @defgroup laed_comp2 --- eig value only or update Q --- + @defgroup laed7 laed7: D&C step: merge subproblems + @defgroup laed8 laed8: D&C step: deflation + @defgroup laed9 laed9: D&C step: secular equation + @defgroup laeda laeda: D&C step: z vector + @} + + @defgroup larr_comp_grp tridiag RRR routines + @{ + @defgroup larra larra: step in stemr + @defgroup larrb larrb: step in stemr + @defgroup larrc larrc: step in stemr + @defgroup larrd larrd: step in stemr, tridiag eig + @defgroup larre larre: step in stemr + @defgroup larrf larrf: step in stemr, find relative robust 
representation (RRR) + @defgroup larrj larrj: step in stemr, refine eigval estimates + @defgroup larrk larrk: step in stemr, compute one eigval + @defgroup larrr larrr: step in stemr, test to do expensive tridiag eig algorithm + @defgroup larrv larrv: eig tridiagonal, step in stemr & stegr + @defgroup lar1v lar1v: step in larrv, hence stemr & stegr + @} + @} + + @defgroup svd_top Singular Value Decomposition (SVD) + @{ + @defgroup svd_driver_grp Standard SVD driver, A = UΣV^H + @{ + @defgroup gesvd_driver --- full --- + @defgroup gesvd gesvd: SVD, QR iteration + @defgroup gesvdq gesvdq: SVD, QR with pivoting + @defgroup gesdd gesdd: SVD, divide and conquer + @defgroup gesvdx gesvdx: SVD, bisection + @defgroup gejsv gejsv: SVD, Jacobi, high-level + @defgroup gesvj gesvj: SVD, Jacobi, low-level + + @defgroup bdsvd_driver --- bidiagonal --- + @defgroup bdsqr bdsqr: bidiagonal SVD, QR iteration (dqds) + @defgroup bdsdc bdsdc: bidiagonal SVD, divide and conquer + @defgroup bdsvdx bdsvdx: bidiagonal SVD, bisection + @} + + @defgroup ggsvd_driver_grp Generalized SVD driver + @{ + @defgroup ggsvd3 ggsvd3: SVD, QR iteration + @} + + @defgroup gesvd_comp_grp SVD computational routines + @{ + @defgroup gebrd gebrd: reduction to bidiagonal + @defgroup gebd2 gebd2: reduction to bidiagonal, level 2 + @defgroup labrd labrd: step in gebrd + @defgroup gbbrd gbbrd: band to bidiagonal + @defgroup ungbr {un,or}gbr: generate Q, P from gebrd + @defgroup unmbr {un,or}mbr: multiply by Q, P from gebrd + + @defgroup gesvd_aux --- auxiliary routines --- + @defgroup gsvj0 gsvj0: step in gesvj + @defgroup gsvj1 gsvj1: step in gesvj + @defgroup las2 las2: 2x2 triangular SVD + @defgroup lasv2 lasv2: 2x2 triangular SVD + @defgroup lartgs lartgs: generate plane rotation for bidiag SVD + @} + + @defgroup ggsvd_comp_grp Generalized SVD computational routines + @{ + @defgroup ggsvp3 ggsvp3: step in ggsvd + @defgroup tgsja tgsja: generalized SVD of trapezoidal matrices, step in ggsvd3 + @defgroup lags2 lags2: 2x2 orthogonal factor, step in tgsja + @defgroup lapll lapll: linear dependence of 2 vectors + @} + + @defgroup lasq_comp_grp bidiag QR iteration routines + @{ + @defgroup lasq1 lasq1: dqds step + @defgroup lasq2 lasq2: dqds step + @defgroup lasq3 lasq3: dqds step + @defgroup lasq4 lasq4: dqds step + @defgroup lasq5 lasq5: dqds step + @defgroup lasq6 lasq6: dqds step + @} + + @defgroup lasd_comp_grp bidiag D&C routines + @{ + @defgroup lasd0 lasd0: D&C step: top level solver + @defgroup lasdt lasdt: D&C step: tree + @defgroup lasd1 lasd1: D&C step: merge subproblems + @defgroup lasd2 lasd2: D&C step: deflation + @defgroup lasd3 lasd3: D&C step: secular equation + @defgroup lasd4 lasd4: D&C step: secular equation nonlinear solver + @defgroup lasd5 lasd5: D&C step: secular equation, 2x2 + @defgroup lasdq lasdq: D&C step: leaf using bdsqr + + @defgroup lasd_comp2 --- singular values only or factored form --- + @defgroup lasda lasda: D&C step: top level solver + @defgroup lasd6 lasd6: D&C step: merge subproblems + @defgroup lasd7 lasd7: D&C step: deflation + @defgroup lasd8 lasd8: D&C step: secular equation + @} + @} + + @defgroup blas_like_top BLAS-like + @{ + @defgroup set_grp Initialize, copy, convert + @{ + @defgroup laset laset: set matrix + @defgroup larnv larnv: random vector + @defgroup laruv laruv: random uniform vector + + @defgroup lacpy lacpy: copy matrix + + @defgroup lacp2 lacp2: general matrix, convert real to complex + @defgroup _lag2_ _lag2_: general matrix, convert double <=> single + @defgroup _lat2_ 
_lat2_: triangular matrix, convert double <=> single + + @defgroup tfttp tfttp: triangular matrix, RFP (tf) to packed (tp) + @defgroup tfttr tfttr: triangular matrix, RFP (tf) to full (tr) + @defgroup tpttf tpttf: triangular matrix, packed (tp) to RFP (tf) + @defgroup tpttr tpttr: triangular matrix, packed (tp) to full (tr) + @defgroup trttf trttf: triangular matrix, full (tr) to RFP (tf) + @defgroup trttp trttp: triangular matrix, full (tr) to packed (tp) + @} + + @defgroup norm_grp Matrix norm + @{ + @defgroup lange lange: general matrix + @defgroup langb langb: general matrix, banded + @defgroup langt langt: general matrix, tridiagonal + + @defgroup lanhs lanhs: Hessenberg + + @defgroup lanhe lan{he,sy}: Hermitian/symmetric matrix + @defgroup lanhf lan{hf,sf}: Hermitian/symmetric matrix, RFP + @defgroup lanhp lan{hp,sp}: Hermitian/symmetric matrix, packed + @defgroup lanhb lan{hb,sb}: Hermitian/symmetric matrix, banded + @defgroup lanht lan{ht,st}: Hermitian/symmetric matrix, tridiagonal + + @defgroup lantr lantr: triangular matrix + @defgroup lantp lantp: triangular matrix, packed + @defgroup lantb lantb: triangular matrix, banded + @} + + @defgroup blas0_like_grp Scalar operations + @{ + @defgroup isnan isnan: test for NaN + @defgroup laisnan laisnan: test for NaN, unoptimized + @defgroup ladiv ladiv: complex divide + @defgroup lapy2 lapy2: robust sqrt( x^2 + y^2 ) + @defgroup lapy3 lapy3: robust sqrt( x^2 + y^2 + z^2 ) + @defgroup larmm larmm: scale factor to avoid overflow, step in latrs + @} + + @defgroup blas1_like_grp Level 1 BLAS-like vector ops + @{ + @defgroup lacgv lacgv: conjugate vector + @defgroup lasrt lasrt: sort vector + @defgroup lassq lassq: sum-of-squares, avoiding over/underflow + @defgroup rscl rscl: scale vector by reciprocal + @} + + @defgroup blas2_like_grp Level 2 BLAS-like matrix-vector ops + @{ + @defgroup ilalc ilalc: find non-zero col + @defgroup ilalr ilalr: find non-zero row + @defgroup lascl lascl: scale matrix + @defgroup la_geamv la_geamv: matrix-vector multiply |A| * |x|, general + @defgroup la_gbamv la_gbamv: matrix-vector multiply |A| * |x|, general banded + @defgroup la_heamv la_heamv: matrix-vector multiply |A| * |x|, Hermitian/symmetric + @defgroup lascl2 lascl2: diagonal scale matrix, A = D A + @defgroup larscl2 larscl2: reciprocal diagonal scale matrix, A = D^{-1} A + @defgroup la_wwaddw la_wwaddw: add to double-double or single-single vector + @} + + @defgroup blas3_like_grp Level 3 BLAS-like matrix-matrix ops + @{ + @defgroup lagtm lagtm: tridiagonal matrix-matrix multiply + @defgroup lacrm lacrm: complex * real matrix-matrix multiply + @defgroup larcm larcm: real * complex matrix-matrix multiply + @defgroup hfrk hfrk: Hermitian rank-k update, RFP format + @defgroup tfsm tfsm: triangular-matrix solve, RFP format + @} + @} + + @defgroup aux_top Auxiliary routines + @{ + @defgroup aux_grp Other auxiliary routines + @{ + @defgroup lsame lsame: string comparison + @defgroup lsamen lsamen: string comparison + @defgroup roundup_lwork roundup_lwork: fix rounding integer to float + @defgroup second second: wall clock timer + @} + + @defgroup params_grp Parameters + @{ + @defgroup lamch lamch: machine parameters + @defgroup lamc1 lamc1: ?? + @defgroup lamc2 lamc2: ?? + @defgroup lamc3 lamc3: ?? + @defgroup lamc4 lamc4: ?? + @defgroup lamc5 lamc5: ?? 
+ @defgroup labad labad: over/underflow on obsolete pre-IEEE machines + @defgroup ilaver ilaver: LAPACK version + @defgroup ilaenv ilaenv: tuning parameters + @defgroup ilaenv2stage ilaenv2stage: tuning parameters for 2-stage eig + @defgroup iparam2stage iparam2stage: sets parameters for 2-stage eig + @defgroup ieeeck ieeeck: verify inf and NaN are safe + @defgroup la_constants la_constants: Fortran 95 module of constants + + @defgroup blast_aux --- BLAST constants --- + @defgroup iladiag iladiag: diag string to BLAST const + @defgroup ilaprec ilaprec: precision string to BLAST const + @defgroup ilatrans ilatrans: trans string to BLAST const + @defgroup ilauplo ilauplo: uplo string to BLAST const + @defgroup la_transtype la_transtype: BLAST const to string + @} + + @defgroup xerbla_grp Error reporting + @{ + @defgroup xerbla xerbla: error reporting + @defgroup xerbla_array xerbla_array: error reporting, callable from C + @} + @} +@} + +@defgroup blas_top BLAS +BLAS are defined by three papers: +Basic linear algebra subprograms for {FORTRAN} usage, Lawson et al, 1979. +An extended set of {FORTRAN} basic linear algebra subprograms, Dongarra et al, 1988. +A set of level 3 basic linear algebra subprograms, Dongarra et al, 1990. +Some BLAS-like routines (e.g., csymv, crot, csum1, icmax1) exist in +LAPACK rather than the classic BLAS. +These were extended by the Extra Precision BLAS (XBLAS, not documented here) +https://www.netlib.org/xblas/ + +@{ + @defgroup scalar_grp Scalar operations + @{ + @defgroup abs1 abs1: | real( x ) | + | imag( x ) | + @} + + @defgroup blas1_grp Level 1 BLAS: vector ops + @{ + @defgroup asum asum: sum | real( x_i ) | + | imag( x_i ) | + @defgroup sum1 sum1: sum | x_i | (in LAPACK) + @defgroup axpy axpy: y = ax + y + @defgroup copy copy: y = x + @defgroup dot dot: x^H x and x^T x + @defgroup iamax iamax: argmax_i | real( x_i ) | + | imag( x_i ) | + @defgroup imax1 imax1: argmax_i | x_i | (in LAPACK) + @defgroup nrm2 nrm2: || x ||_2 + @defgroup scal scal: x = alpha x + @defgroup swap swap: x <=> y + + @defgroup rot_comp --- Givens/Jacobi plane rotations --- + @defgroup rot rot: apply plane rotation ([cz]rot in LAPACK) + @defgroup rotg rotg: generate plane rotation (cf. 
lartg) + @defgroup rotm rotm: apply modified (fast) plane rotation + @defgroup rotmg rotmg: generate modified (fast) plane rotation + @} + + @defgroup blas2_grp Level 2 BLAS: matrix-vector ops + @{ + @defgroup blas2_full --- full --- + @defgroup gemv gemv: general matrix-vector multiply + @defgroup ger ger: general matrix rank-1 update + + @defgroup hemv {he,sy}mv: Hermitian/symmetric matrix-vector multiply ([cz]symv in LAPACK) + @defgroup her {he,sy}r: Hermitian/symmetric rank-1 update + @defgroup her2 {he,sy}r2: Hermitian/symmetric rank-2 update + + @defgroup trmv trmv: triangular matrix-vector multiply + @defgroup trsv trsv: triangular matrix-vector solve + + @defgroup blas2_packed --- packed --- + @defgroup hpmv {hp,sp}mv: Hermitian/symmetric matrix-vector multiply + @defgroup hpr {hp,sp}r: Hermitian/symmetric rank-1 update + @defgroup hpr2 {hp,sp}r2: Hermitian/symmetric rank-2 update + + @defgroup tpmv tpmv: triangular matrix-vector multiply + @defgroup tpsv tpsv: triangular matrix-vector solve + + @defgroup blas2_banded --- banded --- + @defgroup gbmv gbmv: general matrix-vector multiply + + @defgroup hbmv {hb,sb}mv: Hermitian/symmetric matrix-vector multiply + + @defgroup tbmv tbmv: triangular matrix-vector multiply + @defgroup tbsv tbsv: triangular matrix-vector solve + @} + + @defgroup blas3_grp Level 3 BLAS: matrix-matrix ops + @{ + @defgroup gemm gemm: general matrix-matrix multiply + + @defgroup hemm {he,sy}mm: Hermitian/symmetric matrix-matrix multiply + @defgroup herk {he,sy}rk: Hermitian/symmetric rank-k update + @defgroup her2k {he,sy}r2k: Hermitian/symmetric rank-2k update + + @defgroup trmm trmm: triangular matrix-matrix multiply + @defgroup trsm trsm: triangular matrix-matrix solve + @} +@} + **/ diff --git a/lapack-netlib/INSTALL/ilaver.c b/lapack-netlib/INSTALL/ilaver.c index 184e9b78d..d64c841a2 100644 --- a/lapack-netlib/INSTALL/ilaver.c +++ b/lapack-netlib/INSTALL/ilaver.c @@ -315,7 +315,6 @@ typedef struct Namelist Namelist; /* > \author Univ. of Colorado Denver */ /* > \author NAG Ltd. */ -/* > \date November 2019 */ /* > \ingroup auxOTHERauxiliary */ @@ -332,7 +331,7 @@ typedef struct Namelist Namelist; /* ===================================================================== */ *vers_major__ = 3; - *vers_minor__ = 11; + *vers_minor__ = 12; *vers_patch__ = 0; /* ===================================================================== */ diff --git a/lapack-netlib/INSTALL/ilaver.f b/lapack-netlib/INSTALL/ilaver.f index a246c37cb..1827d5cd2 100644 --- a/lapack-netlib/INSTALL/ilaver.f +++ b/lapack-netlib/INSTALL/ilaver.f @@ -44,7 +44,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
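The ILAVER bump above is easy to verify at runtime. A minimal sketch, assuming the usual trailing-underscore Fortran name mangling (toolchain-dependent) and linking against the built library:

```c
/* Smoke test: query the vendored LAPACK version through ILAVER,
 * whose constants are bumped to 3.12.0 in the hunks above.
 * The trailing underscore in the symbol name is a mangling assumption. */
#include <stdio.h>

extern void ilaver_( int *vers_major, int *vers_minor, int *vers_patch );

int main(void) {
    int major, minor, patch;
    ilaver_( &major, &minor, &patch );
    printf( "LAPACK %d.%d.%d\n", major, minor, patch );  /* expect 3.12.0 */
    return 0;
}
```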
* -*> \date November 2019 * *> \ingroup auxOTHERauxiliary * @@ -60,7 +59,7 @@ INTEGER VERS_MAJOR, VERS_MINOR, VERS_PATCH * ===================================================================== VERS_MAJOR = 3 - VERS_MINOR = 11 + VERS_MINOR = 12 VERS_PATCH = 0 * ===================================================================== * diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h index c64fc4416..798a5eb2e 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_config.h +++ b/lapack-netlib/LAPACKE/include/lapacke_config.h @@ -68,6 +68,17 @@ extern "C" { #endif #ifndef LAPACK_COMPLEX_CUSTOM +#if defined(_MSC_VER) + #define _CRT_USE_C_COMPLEX_H + #include <complex.h> + #define LAPACK_COMPLEX_CUSTOM + #define lapack_complex_float _Fcomplex + #define lapack_complex_double _Dcomplex + #define lapack_complex_float_real(z) (creal(z)) + #define lapack_complex_float_imag(z) (cimag(z)) + #define lapack_complex_double_real(z) (creal(z)) + #define lapack_complex_double_imag(z) (cimag(z)) +#else #if defined(LAPACK_COMPLEX_STRUCTURE) @@ -109,6 +120,7 @@ typedef struct { double real, imag; } _lapack_complex_double; #define lapack_complex_double_real(z) (creal(z)) #define lapack_complex_double_imag(z) (cimag(z)) +#endif #endif lapack_complex_float lapack_make_complex_float( float re, float im ); diff --git a/lapack-netlib/LICENSE b/lapack-netlib/LICENSE index 94cdb0f85..96b04c988 100644 --- a/lapack-netlib/LICENSE +++ b/lapack-netlib/LICENSE @@ -1,9 +1,9 @@ -Copyright (c) 1992-2017 The University of Tennessee and The University +Copyright (c) 1992-2023 The University of Tennessee and The University of Tennessee Research Foundation. All rights reserved. -Copyright (c) 2000-2017 The University of California Berkeley. All +Copyright (c) 2000-2023 The University of California Berkeley. All rights reserved. -Copyright (c) 2006-2017 The University of Colorado Denver. All rights +Copyright (c) 2006-2023 The University of Colorado Denver. All rights reserved. $COPYRIGHT$ diff --git a/lapack-netlib/README.md b/lapack-netlib/README.md index 142aa7b72..a00d4c51d 100644 --- a/lapack-netlib/README.md +++ b/lapack-netlib/README.md @@ -6,7 +6,7 @@ [![Appveyor](https://ci.appveyor.com/api/projects/status/bh38iin398msrbtr?svg=true)](https://ci.appveyor.com/project/langou/lapack/) [![codecov](https://codecov.io/gh/Reference-LAPACK/lapack/branch/master/graph/badge.svg)](https://codecov.io/gh/Reference-LAPACK/lapack) [![Packaging status](https://repology.org/badge/tiny-repos/lapack.svg)](https://repology.org/metapackage/lapack/versions) - +[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/Reference-LAPACK/lapack/badge)](https://securityscorecards.dev/viewer/?uri=github.com/Reference-LAPACK/lapack) * VERSION 1.0 : February 29, 1992 * VERSION 1.0a : June 30, 1992 @@ -37,6 +37,7 @@ * VERSION 3.10.0 : June 2021 * VERSION 3.10.1 : April 2022 * VERSION 3.11.0 : November 2022 +* VERSION 3.12.0 : November 2023 LAPACK is a library of Fortran subroutines for solving the most commonly occurring problems in numerical linear algebra.
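With the lapacke_config.h change above, MSVC builds map lapack_complex_float and lapack_complex_double onto the native _Fcomplex and _Dcomplex types rather than the C99 _Complex types. Client code stays portable across both configurations as long as it builds values through the lapack_make_complex_* constructors and reads them through the *_real/*_imag macros instead of using complex literals. A minimal sketch (LAPACKE_cgetrf is standard LAPACKE; error handling omitted):

```c
/* Portable complex usage across the MSVC (_Fcomplex) and
 * C99 (float _Complex) configurations of lapacke_config.h.
 * Assumes the library was built with LAPACKE enabled. */
#include <stdio.h>
#include <lapacke.h>

int main(void) {
    /* lapack_make_complex_float works for both definitions of
       lapack_complex_float, so no #ifdef _MSC_VER is needed here. */
    lapack_complex_float a[4] = {         /* 2x2 matrix, column-major */
        lapack_make_complex_float( 1.0f,  2.0f ),
        lapack_make_complex_float( 3.0f, -1.0f ),
        lapack_make_complex_float( 0.0f,  1.0f ),
        lapack_make_complex_float( 2.0f,  0.0f )
    };
    lapack_int ipiv[2];
    lapack_int info = LAPACKE_cgetrf( LAPACK_COL_MAJOR, 2, 2, a, 2, ipiv );
    printf( "cgetrf info = %d, U(1,1) = %g + %gi\n", (int)info,
            lapack_complex_float_real( a[0] ),
            lapack_complex_float_imag( a[0] ) );
    return 0;
}
```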
diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 448fbd8df..de2242701 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -136,7 +136,7 @@ SLASRC_O = \ sgebrd.o sgecon.o sgeequ.o sgees.o sgeesx.o sgeev.o sgeevx.o \ sgehd2.o sgehrd.o sgelq2.o sgelqf.o \ sgels.o sgelsd.o sgelss.o sgelsy.o sgeql2.o sgeqlf.o \ - sgeqp3.o sgeqr2.o sgeqr2p.o sgeqrf.o sgeqrfp.o sgerfs.o \ + sgeqp3.o sgeqp3rk.o sgeqr2.o sgeqr2p.o sgeqrf.o sgeqrfp.o sgerfs.o \ sgerq2.o sgerqf.o sgesc2.o sgesdd.o sgesv.o sgesvd.o sgesvdx.o sgesvx.o \ sgetc2.o sgetf2.o sgetri.o \ sggbak.o sggbal.o sgges.o sgges3.o sggesx.o \ @@ -151,7 +151,7 @@ SLASRC_O = \ slangb.o slange.o slangt.o slanhs.o slansb.o slansp.o \ slansy.o slantb.o slantp.o slantr.o slanv2.o \ slapll.o slapmt.o \ - slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \ + slaqgb.o slaqge.o slaqp2.o slaqp2rk.o slaqp3rk.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \ slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \ slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \ slarf.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \ @@ -232,7 +232,7 @@ CLASRC_O = \ cgbtf2.o cgbtrf.o cgbtrs.o cgebak.o cgebal.o cgebd2.o cgebrd.o \ cgecon.o cgeequ.o cgees.o cgeesx.o cgeev.o cgeevx.o \ cgehd2.o cgehrd.o cgelq2.o cgelqf.o \ - cgels.o cgelsd.o cgelss.o cgelsy.o cgeql2.o cgeqlf.o cgeqp3.o \ + cgels.o cgelsd.o cgelss.o cgelsy.o cgeql2.o cgeqlf.o cgeqp3.o cgeqp3rk.o \ cgeqr2.o cgeqr2p.o cgeqrf.o cgeqrfp.o cgerfs.o \ cgerq2.o cgerqf.o cgesc2.o cgesdd.o cgesv.o cgesvd.o cgesvdx.o \ cgesvj.o cgejsv.o cgsvj0.o cgsvj1.o \ @@ -266,7 +266,7 @@ CLASRC_O = \ clanhb.o clanhe.o \ clanhp.o clanhs.o clanht.o clansb.o clansp.o clansy.o clantb.o \ clantp.o clantr.o clapll.o clapmt.o clarcm.o claqgb.o claqge.o \ - claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqsb.o \ + claqhb.o claqhe.o claqhp.o claqp2.o claqp2rk.o claqp3rk.o claqps.o claqsb.o \ claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \ claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \ claqz0.o claqz1.o claqz2.o claqz3.o \ @@ -345,7 +345,7 @@ DLASRC_O = \ dgebrd.o dgecon.o dgeequ.o dgees.o dgeesx.o dgeev.o dgeevx.o \ dgehd2.o dgehrd.o dgelq2.o dgelqf.o \ dgels.o dgelsd.o dgelss.o dgelsy.o dgeql2.o dgeqlf.o \ - dgeqp3.o dgeqr2.o dgeqr2p.o dgeqrf.o dgeqrfp.o dgerfs.o \ + dgeqp3.o dgeqp3rk.o dgeqr2.o dgeqr2p.o dgeqrf.o dgeqrfp.o dgerfs.o \ dgerq2.o dgerqf.o dgesc2.o dgesdd.o dgesv.o dgesvd.o dgesvdx.o dgesvx.o \ dgetc2.o dgetf2.o dgetrf.o dgetri.o \ dgetrs.o dggbak.o dggbal.o dgges.o dgges3.o dggesx.o \ @@ -360,7 +360,7 @@ DLASRC_O = \ dlangb.o dlange.o dlangt.o dlanhs.o dlansb.o dlansp.o \ dlansy.o dlantb.o dlantp.o dlantr.o dlanv2.o \ dlapll.o dlapmt.o \ - dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \ + dlaqgb.o dlaqge.o dlaqp2.o dlaqp2rk.o dlaqp3rk.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \ dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \ dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \ @@ -437,7 +437,7 @@ ZLASRC_O = \ zgbtf2.o zgbtrf.o zgbtrs.o zgebak.o zgebal.o zgebd2.o zgebrd.o \ zgecon.o zgeequ.o zgees.o zgeesx.o zgeev.o zgeevx.o \ zgehd2.o zgehrd.o zgelq2.o zgelqf.o \ - zgels.o zgelsd.o zgelss.o zgelsy.o zgeql2.o zgeqlf.o zgeqp3.o \ + zgels.o zgelsd.o zgelss.o zgelsy.o zgeql2.o zgeqlf.o zgeqp3.o zgeqp3rk.o \ zgeqr2.o zgeqr2p.o zgeqrf.o zgeqrfp.o zgerfs.o zgerq2.o zgerqf.o \ zgesc2.o zgesdd.o zgesv.o zgesvd.o zgesvdx.o \ zgesvj.o zgejsv.o zgsvj0.o 
zgsvj1.o \ @@ -473,7 +473,7 @@ ZLASRC_O = \ zlanhe.o \ zlanhp.o zlanhs.o zlanht.o zlansb.o zlansp.o zlansy.o zlantb.o \ zlantp.o zlantr.o zlapll.o zlapmt.o zlaqgb.o zlaqge.o \ - zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqsb.o \ + zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqp2rk.o zlaqp3rk.o zlaqps.o zlaqsb.o \ zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \ zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \ zlaqz0.o zlaqz1.o zlaqz2.o zlaqz3.o \ diff --git a/lapack-netlib/SRC/cbbcsd.f b/lapack-netlib/SRC/cbbcsd.f index 1cd1ffbf1..4d0c45efe 100644 --- a/lapack-netlib/SRC/cbbcsd.f +++ b/lapack-netlib/SRC/cbbcsd.f @@ -805,7 +805,7 @@ CALL SLARTGP( B22BULGE, B22E(I-1), RWORK(IU2SN+I-1), $ RWORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL SLARTGS( B21E(I), B21E(I+1), NU, RWORK(IU2CS+I-1), + CALL SLARTGS( B21E(I), B21D(I+1), NU, RWORK(IU2CS+I-1), $ RWORK(IU2SN+I-1) ) ELSE CALL SLARTGS( B22D(I), B22E(I), MU, RWORK(IU2CS+I-1), diff --git a/lapack-netlib/SRC/cgebrd.f b/lapack-netlib/SRC/cgebrd.f index 5687161a5..5920b1cf5 100644 --- a/lapack-netlib/SRC/cgebrd.f +++ b/lapack-netlib/SRC/cgebrd.f @@ -123,7 +123,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -148,7 +149,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -225,8 +226,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. EXTERNAL CGEBD2, CGEMM, CLABRD, XERBLA @@ -236,16 +237,24 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB - WORK( 1 ) = REAL( LWKOPT ) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -253,7 +262,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -265,7 +274,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -284,7 +292,7 @@ * Determine when to switch from blocked to unblocked code. 
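The cgebrd.f hunks here tighten the LWORK validation (a separate LWKMIN next to LWKOPT) and route the returned size through SROUNDUP_LWORK, but the calling convention is unchanged: pass LWORK = -1 to receive the optimal size in WORK(1), then call again with a real buffer. A sketch of that two-call pattern from C, assuming the common trailing-underscore Fortran name mangling and 32-bit integers (both toolchain-dependent):

```c
/* Two-call LWORK = -1 workspace query against CGEBRD. */
#include <complex.h>
#include <stdio.h>
#include <stdlib.h>

extern void cgebrd_( const int *m, const int *n, float _Complex *a,
                     const int *lda, float *d, float *e,
                     float _Complex *tauq, float _Complex *taup,
                     float _Complex *work, const int *lwork, int *info );

int main(void) {
    int m = 100, n = 50, lda = m, lwork = -1, info = 0;
    float _Complex *a = calloc( (size_t)lda * n, sizeof *a );
    float *d = malloc( n * sizeof *d ), *e = malloc( n * sizeof *e );
    float _Complex *tauq = malloc( n * sizeof *tauq );
    float _Complex *taup = malloc( n * sizeof *taup );
    float _Complex wkopt;

    /* First call: LWORK = -1 only computes the optimal size and returns
       it (rounded safely to float by SROUNDUP_LWORK) in WORK(1). */
    cgebrd_( &m, &n, a, &lda, d, e, tauq, taup, &wkopt, &lwork, &info );
    lwork = (int)crealf( wkopt );

    /* Second call: perform the actual bidiagonal reduction. */
    float _Complex *work = malloc( (size_t)lwork * sizeof *work );
    cgebrd_( &m, &n, a, &lda, d, e, tauq, taup, work, &lwork, &info );
    printf( "cgebrd: info = %d, lwork used = %d\n", info, lwork );
    free( a ); free( d ); free( e ); free( tauq ); free( taup ); free( work );
    return 0;
}
```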
* IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using @@ -343,7 +351,7 @@ * CALL CGEBD2( M-I+1, N-I+1, A( I, I ), LDA, D( I ), E( I ), $ TAUQ( I ), TAUP( I ), WORK, IINFO ) - WORK( 1 ) = WS + WORK( 1 ) = SROUNDUP_LWORK( WS ) RETURN * * End of CGEBRD diff --git a/lapack-netlib/SRC/cgedmd.f90 b/lapack-netlib/SRC/cgedmd.f90 index 499489270..1413130ec 100644 --- a/lapack-netlib/SRC/cgedmd.f90 +++ b/lapack-netlib/SRC/cgedmd.f90 @@ -1,22 +1,526 @@ +!> \brief \b CGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== +! +! SUBROUTINE CGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, EIGS, Z, LDZ, RES, B, LDB, & +! W, LDW, S, LDS, ZWORK, LZWORK, & +! RWORK, LRWORK, IWORK, LIWORK, INFO ) +!..... +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real32 +! +!..... +! Scalar arguments +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LIWORK, LRWORK, LZWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments +! COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) +! COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) +! REAL(KIND=WP), INTENT(OUT) :: RES(*) +! REAL(KIND=WP), INTENT(OUT) :: RWORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) +! +!............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> CGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, CGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, CGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim +!...................................................................... +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. 
+!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim +!...................................................................... +!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim +!...................................................................... +! Arguments +! ========= +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim +!..... +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim +!..... +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. +!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim +!..... +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim +!..... +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library.
+!> 1 :: CGESVD (the QR SVD algorithm) +!> 2 :: CGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: CGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim +!..... +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim +!..... +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim +!..... +!> \param[in,out] X +!> \verbatim +!> X (input/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. +!> \endverbatim +!..... +!> \param[in] LDX +!> \verbatim +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim +!..... +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim +!..... +!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim +!..... +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1) +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim +!..... 
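To make the NRNK modes just described concrete, here is an illustrative helper (not code from CGEDMD; the name is hypothetical) that applies the documented truncation rules to a descending array of singular values, with TOL as described next:

```c
/* Illustrative only: how the documented NRNK/TOL modes determine the
 * numerical rank K from singular values sigma[0..n-1] in descending
 * order.  This mirrors the rules stated above for CGEDMD's NRNK. */
int dmd_rank( const float *sigma, int n, int nrnk, float tol ) {
    int k = 0;
    for ( int i = 0; i < n; i++ ) {
        if ( sigma[i] <= 0.0f ) break;  /* zero singular values never count */
        if ( nrnk == -1 && sigma[i] <= tol * sigma[0] ) break;
        if ( nrnk == -2 && i > 0 && sigma[i] <= tol * sigma[i-1] ) break;
        if ( nrnk > 0 && k >= nrnk ) break;  /* at most NRNK values used */
        k++;
    }
    return k;
}
```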
+!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim +!..... +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim +!..... +!> \param[out] EIGS +!> \verbatim +!> EIGS (output) COMPLEX(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of EIGS contain +!> the computed eigenvalues (Ritz values). +!> See the descriptions of K, and Z. +!> \endverbatim +!..... +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) COMPLEX(KIND=WP) M-by-N array +!> If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) +!> is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +!> If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +!> the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) +!> is an eigenvector corresponding to EIGS(i). The columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. +!> See the descriptions of EIGS, X and W. +!> \endverbatim +!..... +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim +!..... +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs, +!> RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. +!> See the description of EIGS and Z. +!> \endverbatim +!..... +!> \param[out] B +!> \verbatim +!> B (output) COMPLEX(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1:K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim +!..... +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim +!..... +!> \param[out] W +!> \verbatim +!> W (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient. +!> The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> right singular vectors of X. +!> \endverbatim +!..... +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim +!..... +!> \param[out] S +!> \verbatim +!> S (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by CGEEV. +!> See the description of K. +!> \endverbatim +!..... +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim +!..... 
+!> \param[out] ZWORK +!> \verbatim +!> ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array +!> ZWORK is used as complex workspace in the complex SVD, as +!> specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing +!> the eigenvalues of a Rayleigh quotient. +!> If the call to CGEDMD is only workspace query, then +!> ZWORK(1) contains the minimal complex workspace length and +!> ZWORK(2) is the optimal complex workspace length. +!> Hence, the length of work is at least 2. +!> See the description of LZWORK. +!> \endverbatim +!..... +!> \param[in] LZWORK +!> \verbatim +!> LZWORK (input) INTEGER +!> The minimal length of the workspace vector ZWORK. +!> LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV), +!> where LZWORK_CGEEV = MAX( 1, 2*N ) and the minimal +!> LZWORK_SVD is calculated as follows +!> If WHTSVD == 1 :: CGESVD :: +!> LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) +!> If WHTSVD == 2 :: CGESDD :: +!> LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) +!> If WHTSVD == 3 :: CGESVDQ :: +!> LZWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: CGEJSV :: +!> LZWORK_SVD = obtainable by a query +!> If on entry LZWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths and returns them in +!> LZWORK(1) and LZWORK(2), respectively. +!> \endverbatim +!..... +!> \param[out] RWORK +!> \verbatim +!> RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array +!> On exit, RWORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain +!> scaling factor RWORK(N+2)/RWORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. +!> If the call to CGEDMD is only workspace query, then +!> RWORK(1) contains the minimal workspace length. +!> See the description of LRWORK. +!> \endverbatim +!..... +!> \param[in] LRWORK +!> \verbatim +!> LRWORK (input) INTEGER +!> The minimal length of the workspace vector RWORK. +!> LRWORK is calculated as follows: +!> LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where +!> LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace +!> for the SVD subroutine determined by the input parameter +!> WHTSVD. +!> If WHTSVD == 1 :: CGESVD :: +!> LRWORK_SVD = 5*MIN(M,N) +!> If WHTSVD == 2 :: CGESDD :: +!> LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), +!> 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) +!> If WHTSVD == 3 :: CGESVDQ :: +!> LRWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: CGEJSV :: +!> LRWORK_SVD = obtainable by a query +!> If on entry LRWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> real workspace length and returns it in RWORK(1). +!> \endverbatim +!..... +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim +!..... +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. 
+!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for ZWORK, RWORK and +!> IWORK. See the descriptions of ZWORK, RWORK and IWORK. +!> \endverbatim +!..... +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! +!............................................................. +!............................................................. SUBROUTINE CGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & M, N, X, LDX, Y, LDY, NRNK, TOL, & K, EIGS, Z, LDZ, RES, B, LDB, & W, LDW, S, LDS, ZWORK, LZWORK, & RWORK, LRWORK, IWORK, LIWORK, INFO ) -! March 2023 +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! !..... USE iso_fortran_env IMPLICIT NONE INTEGER, PARAMETER :: WP = real32 -!..... +! ! Scalar arguments +! ~~~~~~~~~~~~~~~~ CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & NRNK, LDZ, LDB, LDW, LDS, & LIWORK, LRWORK, LZWORK INTEGER, INTENT(OUT) :: K, INFO REAL(KIND=WP), INTENT(IN) :: TOL +! ! Array arguments +! ~~~~~~~~~~~~~~~ COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & W(LDW,*), S(LDS,*) @@ -25,364 +529,14 @@ REAL(KIND=WP), INTENT(OUT) :: RES(*) REAL(KIND=WP), INTENT(OUT) :: RWORK(*) INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! CGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, CGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, CGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. 
Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. ! -!...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ -!...................................................................... -! Arguments -! ========= -! JOBS (input) CHARACTER*1 -! Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. -!..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 'N' :: The eigenvectors are not computed. -!..... -! JOBR (input) CHARACTER*1 -! Determines whether to compute the residuals. -! 'R' :: The residuals for the computed eigenpairs will be -! computed and stored in the array RES. -! See the description of RES. -! For this option to be legal, JOBZ must be 'V'. -! 'N' :: The residuals are not computed. -!..... -! JOBF (input) CHARACTER*1 -! Specifies whether to store information needed for post- -! processing (e.g. computing refined Ritz vectors) -! 'R' :: The matrix needed for the refinement of the Ritz -! vectors is computed and stored in the array B. -! See the description of B. -! 'E' :: The unscaled eigenvectors of the Exact DMD are -! 
computed and returned in the array B. See the -! description of B. -! 'N' :: No eigenvector refinement data is computed. -!..... -! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } -! Allows for a selection of the SVD algorithm from the -! LAPACK library. -! 1 :: CGESVD (the QR SVD algorithm) -! 2 :: CGESDD (the Divide and Conquer algorithm; if enough -! workspace available, this is the fastest option) -! 3 :: CGESVDQ (the preconditioned QR SVD ; this and 4 -! are the most accurate options) -! 4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 -! are the most accurate options) -! For the four methods above, a significant difference in -! the accuracy of small singular values is possible if -! the snapshots vary in norm so that X is severely -! ill-conditioned. If small (smaller than EPS*||X||) -! singular values are of interest and JOBS=='N', then -! the options (3, 4) give the most accurate results, where -! the option 4 is slightly better and with stronger -! theoretical background. -! If JOBS=='S', i.e. the columns of X will be normalized, -! then all methods give nearly equally accurate results. -!..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). -!..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). -!..... -! X (input/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. -!..... -! LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. -!..... -! Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. -!..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. -!..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1) -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. -!..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. -!..... -! K (output) INTEGER, 0 <= K <= N -! The dimension of the POD basis for the data snapshot -! 
matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. -!..... -! EIGS (output) COMPLEX(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of EIGS contain -! the computed eigenvalues (Ritz values). -! See the descriptions of K, and Z. -!..... -! Z (workspace/output) COMPLEX(KIND=WP) M-by-N array -! If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) -! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. -! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as -! the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) -! is an eigenvector corresponding to EIGS(i). The columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. -! See the descriptions of EIGS, X and W. -!..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. -!..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs, -! RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. -! See the description of EIGS and Z. -!..... -! B (output) COMPLEX(KIND=WP) M-by-N array. -! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1:K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. -!..... -! LDB (input) INTEGER, LDB >= M -! The leading dimension of the array B. -!..... -! W (workspace/output) COMPLEX(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient. -! The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! W is also used as a workspace to temporarily store the -! right singular vectors of X. -!..... -! LDW (input) INTEGER, LDW >= N -! The leading dimension of the array W. -!..... -! S (workspace/output) COMPLEX(KIND=WP) N-by-N array -! The array S(1:K,1:K) is used for the matrix Rayleigh -! quotient. This content is overwritten during -! the eigenvalue decomposition by CGEEV. -! See the description of K. -!..... -! LDS (input) INTEGER, LDS >= N -! The leading dimension of the array S. -!..... -! ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array -! ZWORK is used as complex workspace in the complex SVD, as -! specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing -! the eigenvalues of a Rayleigh quotient. -! If the call to CGEDMD is only workspace query, then -! ZWORK(1) contains the minimal complex workspace length and -! ZWORK(2) is the optimal complex workspace length. -! Hence, the length of work is at least 2. -! See the description of LZWORK. -!..... -! LZWORK (input) INTEGER -! The minimal length of the workspace vector ZWORK. -! LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV), -! where LZWORK_CGEEV = MAX( 1, 2*N ) and the minimal -! LZWORK_SVD is calculated as follows -! If WHTSVD == 1 :: CGESVD :: -! LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) -! If WHTSVD == 2 :: CGESDD :: -! LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) -! If WHTSVD == 3 :: CGESVDQ :: -! LZWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: CGEJSV :: -! LZWORK_SVD = obtainable by a query -! 
If on entry LZWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths and returns them in -! LZWORK(1) and LZWORK(2), respectively. -!..... -! RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array -! On exit, RWORK(1:N) contains the singular values of -! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). -! If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain -! scaling factor RWORK(N+2)/RWORK(N+1) used to scale X -! and Y to avoid overflow in the SVD of X. -! This may be of interest if the scaling option is off -! and as many as possible smallest eigenvalues are -! desired to the highest feasible accuracy. -! If the call to CGEDMD is only workspace query, then -! RWORK(1) contains the minimal workspace length. -! See the description of LRWORK. -!..... -! LRWORK (input) INTEGER -! The minimal length of the workspace vector RWORK. -! LRWORK is calculated as follows: -! LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where -! LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace -! for the SVD subroutine determined by the input parameter -! WHTSVD. -! If WHTSVD == 1 :: CGESVD :: -! LRWORK_SVD = 5*MIN(M,N) -! If WHTSVD == 2 :: CGESDD :: -! LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), -! 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) -! If WHTSVD == 3 :: CGESVDQ :: -! LRWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: CGEJSV :: -! LRWORK_SVD = obtainable by a query -! If on entry LRWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! real workspace length and returns it in RWORK(1). -!..... -! IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. -!..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for ZWORK, RWORK and -! IWORK. See the descriptions of ZWORK, RWORK and IWORK. -!..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. -!............................................................. -!............................................................. ! Parameters ! ~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) - +! ! Local scalars ! 
~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -400,7 +554,7 @@ ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: RDUMMY(2) - +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) CLANGE, SLAMCH, SCNRM2 @@ -408,13 +562,13 @@ INTEGER ICAMAX LOGICAL SISNAN, LSAME EXTERNAL SISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~~~~ EXTERNAL CAXPY, CGEMM, CSSCAL EXTERNAL CGEEV, CGEJSV, CGESDD, CGESVD, CGESVDQ, & CLACPY, CLASCL, CLASSQ, XERBLA - +! ! Intrinsic functions ! ~~~~~~~~~~~~~~~~~~~ INTRINSIC FLOAT, INT, MAX, SQRT @@ -607,7 +761,8 @@ K = 0 DO i = 1, N !WORK(i) = SCNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL CLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 @@ -680,7 +835,8 @@ ! carefully computed using CLASSQ. DO i = 1, N !RWORK(i) = SCNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL CLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 diff --git a/lapack-netlib/SRC/cgehrd.f b/lapack-netlib/SRC/cgehrd.f index f407f931a..7ba87cc01 100644 --- a/lapack-netlib/SRC/cgehrd.f +++ b/lapack-netlib/SRC/cgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -222,13 +222,19 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'CGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'CGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -249,7 +255,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -269,7 +274,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of * @@ -345,7 +350,8 @@ * Use unblocked code to reduce the rest of the matrix * CALL CGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cgelq.f b/lapack-netlib/SRC/cgelq.f index ff482bc42..24aaa982e 100644 --- a/lapack-netlib/SRC/cgelq.f +++ b/lapack-netlib/SRC/cgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed.
The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -295,9 +295,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -322,7 +322,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) * RETURN * diff --git a/lapack-netlib/SRC/cgelqf.f b/lapack-netlib/SRC/cgelqf.f index 75f5bc960..3847a958a 100644 --- a/lapack-netlib/SRC/cgelqf.f +++ b/lapack-netlib/SRC/cgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -175,9 +176,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'CGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -185,19 +185,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -267,7 +273,7 @@ $ CALL CGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGELQF diff --git a/lapack-netlib/SRC/cgemlq.f b/lapack-netlib/SRC/cgemlq.f index e0cf78bc0..e5b02b669 100644 --- a/lapack-netlib/SRC/cgemlq.f +++ b/lapack-netlib/SRC/cgemlq.f @@ -110,16 +110,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -143,7 +144,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -159,11 +160,13 @@ *> block sizes MB and NB returned by ILAENV, CGELQ will use either *> CLASWLQ (if the matrix is wide-and-short) or CGELQT to compute *> the LQ factorization. 
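The T/WORK bookkeeping these cgelq.f and cgemlq.f hunks adjust follows the query convention documented above: a call with TSIZE = -1 and LWORK = -1 returns the required sizes in T(1) and WORK(1). A sketch from C, assuming trailing-underscore Fortran mangling, 32-bit integers, and the CGELQ interface as declared in Reference-LAPACK's cgelq.f (the helper name is made up):

```c
/* One query call sizes both the T and WORK arrays for CGELQ, then the
 * factorization runs with real buffers.  T is kept by the caller because
 * CGEMLQ needs it later to apply Q. */
#include <complex.h>
#include <stdlib.h>

extern void cgelq_( const int *m, const int *n, float _Complex *a,
                    const int *lda, float _Complex *t, const int *tsize,
                    float _Complex *work, const int *lwork, int *info );

int lq_factor( int m, int n, float _Complex *a, int lda,
               float _Complex **t_out, int *tsize_out ) {
    int info, tsize = -1, lwork = -1;      /* -1 requests the optimal sizes */
    float _Complex tq, wq;

    cgelq_( &m, &n, a, &lda, &tq, &tsize, &wq, &lwork, &info );
    if ( info != 0 ) return info;
    tsize = (int)crealf( tq );             /* size came back in T(1)    */
    lwork = (int)crealf( wq );             /* size came back in WORK(1) */

    float _Complex *t    = malloc( (size_t)tsize * sizeof *t );
    float _Complex *work = malloc( (size_t)lwork * sizeof *work );
    cgelq_( &m, &n, a, &lda, t, &tsize, work, &lwork, &info );
    free( work );
    *t_out = t;
    *tsize_out = tsize;
    return info;
}
```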
-*> This version of CGEMLQ will use either CLAMSWLQ or CGEMLQT to +*> This version of CGEMLQ will use either CLAMSWLQ or CGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in CLAMSWLQ or CGEMLQT. *> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE CGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -185,11 +188,12 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLAMSWLQ, CGEMLQT, XERBLA @@ -201,7 +205,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -216,6 +220,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -244,12 +255,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( ( LWORK.LT.LWMIN ) .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = REAL( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -261,7 +272,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -274,7 +285,7 @@ $ MB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = REAL( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/cgemqr.f b/lapack-netlib/SRC/cgemqr.f index ea9de146e..0b7dd9dd7 100644 --- a/lapack-netlib/SRC/cgemqr.f +++ b/lapack-netlib/SRC/cgemqr.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -166,6 +167,8 @@ *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE CGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -187,11 +190,12 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. 
LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGEMQRT, CLAMTSQR, XERBLA @@ -203,7 +207,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -218,6 +222,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -251,7 +262,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -263,7 +274,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -276,7 +287,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/cgeqlf.f b/lapack-netlib/SRC/cgeqlf.f index 918bbddad..6c67344c5 100644 --- a/lapack-netlib/SRC/cgeqlf.f +++ b/lapack-netlib/SRC/cgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -187,10 +188,11 @@ NB = ILAENV( 1, 'CGEQLF', ' ', M, N, -1, -1 ) LWKOPT = N*NB END IF - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * @@ -277,7 +279,7 @@ IF( MU.GT.0 .AND. 
NU.GT.0 ) $ CALL CGEQL2( MU, NU, A, LDA, TAU, WORK, IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGEQLF diff --git a/lapack-netlib/SRC/cgeqp3rk.c b/lapack-netlib/SRC/cgeqp3rk.c new file mode 100644 index 000000000..54e7fb140 --- /dev/null +++ b/lapack-netlib/SRC/cgeqp3rk.c @@ -0,0 +1,1071 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist 
*/ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + +/* Check if the matrix A contains +Inf or -Inf, set INFO parameter */ +/* to the column number, where the first +/-Inf is found plus N, */ +/* and continue the computation. */ + + *info = *n + kp1; + + } + +/* ================================================================== */ + +/* Quick return if possible for the case when the first */ +/* stopping criterion is satisfied, i.e. KMAX = 0. */ + + if (*kmax == 0) { + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.f; + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + q__1.r = (real) lwkopt, q__1.i = 0.f; + work[1].r = q__1.r, work[1].i = q__1.i; + return 0; + } + +/* ================================================================== */ + + eps = slamch_("Epsilon"); + +/* Adjust ABSTOL */ + + if (*abstol >= 0.f) { + safmin = slamch_("Safe minimum"); +/* Computing MAX */ + r__1 = *abstol, r__2 = safmin * 2.f; + *abstol = f2cmax(r__1,r__2); + } + +/* Adjust RELTOL */ + + if (*reltol >= 0.f) { + *reltol = f2cmax(*reltol,eps); + } + +/* =================================================================== */ + +/* JMAX is the maximum index of the column to be factorized, */ +/* which is also limited by the first stopping criterion KMAX. */ + + jmax = f2cmin(*kmax,minmn); + +/* =================================================================== */ + +/* Quick return if possible for the case when the second or third */ +/* stopping criterion for the whole original matrix is satified, */ +/* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */ +/* (which is ONE <= RELTOL). */ + + if (maxc2nrm <= *abstol || 1.f <= *reltol) { + + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.f; + + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + + q__1.r = (real) lwkopt, q__1.i = 0.f; + work[1].r = q__1.r, work[1].i = q__1.i; + return 0; + } + +/* ================================================================== */ +/* Factorize columns */ +/* ================================================================== */ + +/* Determine the block size. */ + + nbmin = 2; + nx = 0; + + if (nb > 1 && nb < minmn) { + +/* Determine when to cross over from blocked to unblocked code. */ +/* (for N less than NX, unblocked code should be used). 
*/ + +/* Computing MAX */ + i__1 = 0, i__2 = ilaenv_(&c__3, "CGEQP3RK", " ", m, n, &c_n1, &c_n1, ( + ftnlen)8, (ftnlen)1); + nx = f2cmax(i__1,i__2); + + if (nx < minmn) { + +/* Determine if workspace is large enough for blocked code. */ + + if (*lwork < lwkopt) { + +/* Not enough workspace to use optimal block size that */ +/* is currently stored in NB. */ +/* Reduce NB and determine the minimum value of NB. */ + + nb = (*lwork - (*n << 1)) / (*n + 1); +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "CGEQP3RK", " ", m, n, &c_n1, + &c_n1, (ftnlen)8, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + } + } + } + +/* ================================================================== */ + +/* DONE is the boolean flag to rerpresent the case when the */ +/* factorization completed in the block factorization routine, */ +/* before the end of the block. */ + + done = FALSE_; + +/* J is the column index. */ + + j = 1; + +/* (1) Use blocked code initially. */ + +/* JMAXB is the maximum column index of the block, when the */ +/* blocked code is used, is also limited by the first stopping */ +/* criterion KMAX. */ + +/* Computing MIN */ + i__1 = *kmax, i__2 = minmn - nx; + jmaxb = f2cmin(i__1,i__2); + + if (nb >= nbmin && nb < jmax && jmaxb > 0) { + +/* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */ +/* J is the column index of a column block; */ +/* JB is the column block size to pass to block factorization */ +/* routine in a loop step; */ +/* JBF is the number of columns that were actually factorized */ +/* that was returned by the block factorization routine */ +/* in a loop step, JBF <= JB; */ +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + while(j <= jmaxb) { + +/* Computing MIN */ + i__1 = nb, i__2 = jmaxb - j + 1; + jb = f2cmin(i__1,i__2); + n_sub__ = *n - j + 1; + ioffset = j - 1; + +/* Factorize JB columns among the columns A(J:N). */ + + i__1 = *n + *nrhs - j + 1; + claqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, + &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, + maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], & + rwork[*n + j], &work[1], &work[jb + 1], &i__1, &iwork[1], + &iinfo); + +/* Set INFO on the first occurence of Inf. */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } + + if (done) { + +/* Either the submatrix is zero before the end of the */ +/* column block, or ABSTOL or RELTOL criterion is */ +/* satisfied before the end of the column block, we can */ +/* return from the routine. Perform the following before */ +/* returning: */ +/* a) Set the number of factorized columns K, */ +/* K = IOFFSET + JBF from the last call of blocked */ +/* routine. */ +/* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */ +/* by the block factorization routine; */ +/* 2) The remaining TAUs are set to ZERO by the */ +/* block factorization routine. */ + + *k = ioffset + jbf; + +/* Set INFO on the first occurrence of NaN, NaN takes */ +/* prcedence over Inf. */ + + if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + +/* Return from the routine. */ + + q__1.r = (real) lwkopt, q__1.i = 0.f; + work[1].r = q__1.r, work[1].i = q__1.i; + + return 0; + + } + + j += jbf; + + } + + } + +/* Use unblocked code to factor the last or only block. 
*/ +/* J = JMAX+1 means we factorized the maximum possible number of */ +/* columns, that is in ELSE clause we need to compute */ +/* the MAXC2NORM and RELMAXC2NORM to return after we processed */ +/* the blocks. */ + + if (j <= jmax) { + +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + n_sub__ = *n - j + 1; + ioffset = j - 1; + + i__1 = jmax - j + 1; + claqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, & + maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, + relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], &rwork[*n + j], & + work[1], &iinfo); + +/* ABSTOL or RELTOL criterion is satisfied when the number of */ +/* the factorized columns KF is smaller then the number */ +/* of columns JMAX-J+1 supplied to be factorized by the */ +/* unblocked routine, we can return from */ +/* the routine. Perform the following before returning: */ +/* a) Set the number of factorized columns K, */ +/* b) MAXC2NRMK and RELMAXC2NRMK are returned by the */ +/* unblocked factorization routine above. */ + + *k = j - 1 + kf; + +/* Set INFO on the first exception occurence. */ + +/* Set INFO on the first exception occurence of Inf or NaN, */ +/* (NaN takes precedence over Inf). */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } else if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + + } else { + +/* Compute the return values for blocked code. */ + +/* Set the number of factorized columns if the unblocked routine */ +/* was not called. */ + + *k = jmax; + +/* If there exits a residual matrix after the blocked code: */ +/* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */ +/* residual matrix, otherwise set them to ZERO; */ +/* 2) Set TAU(K+1:MINMN) to ZERO. */ + + if (*k < minmn) { + i__1 = *n - *k; + jmaxc2nrm = *k + isamax_(&i__1, &rwork[*k + 1], &c__1); + *maxc2nrmk = rwork[jmaxc2nrm]; + if (*k == 0) { + *relmaxc2nrmk = 1.f; + } else { + *relmaxc2nrmk = *maxc2nrmk / maxc2nrm; + } + + i__1 = minmn; + for (j = *k + 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + + } else { + *maxc2nrmk = 0.f; + *relmaxc2nrmk = 0.f; + + } + +/* END IF( J.LE.JMAX ) THEN */ + + } + + q__1.r = (real) lwkopt, q__1.i = 0.f; + work[1].r = q__1.r, work[1].i = q__1.i; + + return 0; + +/* End of CGEQP3RK */ + +} /* cgeqp3rk_ */ + diff --git a/lapack-netlib/SRC/cgeqp3rk.f b/lapack-netlib/SRC/cgeqp3rk.f new file mode 100644 index 000000000..731c44edb --- /dev/null +++ b/lapack-netlib/SRC/cgeqp3rk.f @@ -0,0 +1,1093 @@ +*> \brief \b CGEQP3RK computes a truncated Householder QR factorization with column pivoting of a complex m-by-n matrix A by using Level 3 BLAS and overwrites m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGEQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, +* $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ WORK, LWORK, RWORK, IWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, K, KMAX, LDA, LWORK, M, N, NRHS +* REAL ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* REAL RWORK( * ) +* COMPLEX A( LDA, * ), TAU( * ), WORK( * ) +* .. 
+* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGEQP3RK performs two tasks simultaneously: +*> +*> Task 1: The routine computes a truncated (rank K) or full rank +*> Householder QR factorization with column pivoting of a complex +*> M-by-N matrix A using Level 3 BLAS. K is the number of columns +*> that were factorized, i.e. factorization rank of the +*> factor R, K <= min(M,N). +*> +*> A * P(K) = Q(K) * R(K) = +*> +*> = Q(K) * ( R11(K) R12(K) ) = Q(K) * ( R(K)_approx ) +*> ( 0 R22(K) ) ( 0 R(K)_residual ), +*> +*> where: +*> +*> P(K) is an N-by-N permutation matrix; +*> Q(K) is an M-by-M unitary matrix; +*> R(K)_approx = ( R11(K), R12(K) ) is a rank K approximation of the +*> full rank factor R with K-by-K upper-triangular +*> R11(K) and K-by-N rectangular R12(K). The diagonal +*> entries of R11(K) appear in non-increasing order +*> of absolute value, and absolute values of all of +*> them exceed the maximum column 2-norm of R22(K) +*> up to roundoff error. +*> R(K)_residual = R22(K) is the residual of a rank K approximation +*> of the full rank factor R. It is a +*> an (M-K)-by-(N-K) rectangular matrix; +*> 0 is a an (M-K)-by-K zero matrix. +*> +*> Task 2: At the same time, the routine overwrites a complex M-by-NRHS +*> matrix B with Q(K)**H * B using Level 3 BLAS. +*> +*> ===================================================================== +*> +*> The matrices A and B are stored on input in the array A as +*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS) +*> respectively. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> The truncation criteria (i.e. when to stop the factorization) +*> can be any of the following: +*> +*> 1) The input parameter KMAX, the maximum number of columns +*> KMAX to factorize, i.e. the factorization rank is limited +*> to KMAX. If KMAX >= min(M,N), the criterion is not used. +*> +*> 2) The input parameter ABSTOL, the absolute tolerance for +*> the maximum column 2-norm of the residual matrix R22(K). This +*> means that the factorization stops if this norm is less or +*> equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used. +*> +*> 3) The input parameter RELTOL, the tolerance for the maximum +*> column 2-norm matrix of the residual matrix R22(K) divided +*> by the maximum column 2-norm of the original matrix A, which +*> is equal to abs(R(1,1)). This means that the factorization stops +*> when the ratio of the maximum column 2-norm of R22(K) to +*> the maximum column 2-norm of A is less than or equal to RELTOL. +*> If RELTOL < 0.0, the criterion is not used. +*> +*> 4) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix R22(K) is a zero matrix in some +*> factorization step K. ( This stopping criterion is implicit. ) +*> +*> The algorithm stops when any of these conditions is first +*> satisfied, otherwise the whole matrix A is factorized. +*> +*> To factorize the whole matrix A, use the values +*> KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0. +*> +*> The routine returns: +*> a) Q(K), R(K)_approx = ( R11(K), R12(K) ), +*> R(K)_residual = R22(K), P(K), i.e. the resulting matrices +*> of the factorization; P(K) is represented by JPIV, +*> ( if K = min(M,N), R(K)_approx is the full factor R, +*> and there is no residual matrix R(K)_residual); +*> b) K, the number of columns that were factorized, +*> i.e. 
factorization rank; +*> c) MAXC2NRMK, the maximum column 2-norm of the residual +*> matrix R(K)_residual = R22(K), +*> ( if K = min(M,N), MAXC2NRMK = 0.0 ); +*> d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum +*> column 2-norm of the original matrix A, which is equal +*> to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 ); +*> e) Q(K)**H * B, the matrix B with the unitary +*> transformation Q(K)**H applied on the left. +*> +*> The N-by-N permutation matrix P(K) is stored in a compact form in +*> the integer array JPIV. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The M-by-M unitary matrix Q is represented as a product +*> of elementary Householder reflectors +*> +*> Q(K) = H(1) * H(2) * . . . * H(K), +*> +*> where K is the number of columns that were factorized. +*> +*> Each H(j) has the form +*> +*> H(j) = I - tau * v * v**H, +*> +*> where 1 <= j <= K and +*> I is an M-by-M identity matrix, +*> tau is a complex scalar, +*> v is a complex vector with v(1:j-1) = 0 and v(j) = 1. +*> +*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j). +*> +*> See the Further Details section for more information. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e. the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M,N), then this stopping criterion +*> is not used, the routine factorizes columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B are not modified, and +*> the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL +*> +*> The second factorization stopping criterion, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix R22(K). +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix R22(K) +*> is less than or equal to ABSTOL. Let SAFMIN = DLAMCH('S'). +*> +*> a) If ABSTOL is NaN, then no computation is performed +*> and an error message ( INFO = -5 ) is issued +*> by XERBLA. +*> +*> b) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN +*> is used. This includes the case ABSTOL = -0.0. +*> +*> d) If 2*SAFMIN <= ABSTOL then the input value +*> of ABSTOL is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If ABSTOL chosen above is >= MAXC2NRM, then this +*> stopping criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. The routine +*> returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. 
+*> This includes the case ABSTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is REAL +*> +*> The third factorization stopping criterion, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio +*> abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of +*> the residual matrix R22(K) to the maximum column 2-norm of +*> the original matrix A. The algorithm converges (stops the +*> factorization), when abs(R(K+1,K+1))/abs(R(1,1)) A is less +*> than or equal to RELTOL. Let EPS = DLAMCH('E'). +*> +*> a) If RELTOL is NaN, then no computation is performed +*> and an error message ( INFO = -6 ) is issued +*> by XERBLA. +*> +*> b) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used. +*> This includes the case RELTOL = -0.0. +*> +*> d) If EPS <= RELTOL then the input value of RELTOL +*> is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If RELTOL chosen above is >= 1.0, then this stopping +*> criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. +*> The routine returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case RELTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> +*> NOTE: We recommend that RELTOL satisfy +*> min( 10*max(M,N)*EPS, sqrt(EPS) ) <= RELTOL +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N+NRHS) +*> +*> On entry: +*> +*> a) The subarray A(1:M,1:N) contains the M-by-N matrix A. +*> b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS +*> matrix B. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> +*> a) The subarray A(1:M,1:N) contains parts of the factors +*> of the matrix A: +*> +*> 1) If K = 0, A(1:M,1:N) contains the original matrix A. +*> 2) If K > 0, A(1:M,1:N) contains parts of the +*> factors: +*> +*> 1. The elements below the diagonal of the subarray +*> A(1:M,1:K) together with TAU(1:K) represent the +*> unitary matrix Q(K) as a product of K Householder +*> elementary reflectors. +*> +*> 2. The elements on and above the diagonal of +*> the subarray A(1:K,1:N) contain K-by-N +*> upper-trapezoidal matrix +*> R(K)_approx = ( R11(K), R12(K) ). +*> NOTE: If K=min(M,N), i.e. full rank factorization, +*> then R_approx(K) is the full factor R which +*> is upper-trapezoidal. If, in addition, M>=N, +*> then R is upper-triangular. +*> +*> 3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K) +*> rectangular matrix R(K)_residual = R22(K). +*> +*> b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains +*> the M-by-NRHS product Q(K)**H * B. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> This is the leading dimension for both matrices, A and B. +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M,KMAX,N). 
+*> +*> K also represents the number of non-zero Householder +*> vectors. +*> +*> NOTE: If K = 0, a) the arrays A and B are not modified; +*> b) the array TAU(1:min(M,N)) is set to ZERO, +*> if the matrix A does not contain NaN, +*> otherwise the elements TAU(1:min(M,N)) +*> are undefined; +*> c) the elements of the array JPIV are set +*> as follows: for j = 1:N, JPIV(j) = j. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is REAL +*> The maximum column 2-norm of the residual matrix R22(K), +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then MAXC2NRMK equals the maximum column 2-norm +*> of the original matrix A. +*> +*> b) If 0 < K < min(M,N), then MAXC2NRMK is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then MAXC2NRMK = 0.0. +*> +*> NOTE: MAXC2NRMK in the factorization step K would equal +*> R(K+1,K+1) in the next factorization step K+1. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is REAL +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix R22(K) (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then RELMAXC2NRMK = 1.0. +*> +*> b) If 0 < K < min(M,N), then +*> RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then RELMAXC2NRMK = 0.0. +*> +*> NOTE: RELMAXC2NRMK in the factorization step K would equal +*> abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization +*> step K+1. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The elements of the array JPIV(1:N) are always set +*> by the routine, for example, even when no columns +*> were factorized, i.e. when K = 0, the elements are +*> set as JPIV(j) = j for j = 1:N. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX array, dimension (min(M,N)) +*> The scalar factors of the elementary reflectors. +*> +*> If 0 < K <= min(M,N), only the elements TAU(1:K) of +*> the array TAU are modified by the factorization. +*> After the factorization computed, if no NaN was found +*> during the factorization, the remaining elements +*> TAU(K+1:min(M,N)) are set to zero, otherwise the +*> elements TAU(K+1:min(M,N)) are not set and therefore +*> undefined. +*> ( If K = 0, all elements of TAU are set to zero, if +*> the matrix A does not contain NaN. ) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= N+NRHS-1, otherwise. +*> For optimal performance LWORK >= NB*( N+NRHS+1 ), +*> where NB is the optimal block size for CGEQP3RK returned +*> by ILAENV. Minimal block size MINNB=2. 
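Editorial note: the LWORK rules above follow the standard LAPACK two-call convention, so a minimal fixed-form driver sketch may help; the program name TRUNCQR, the 6-by-4 shape, the rank-1 all-ones data, and the RELTOL value are illustrative assumptions, not part of the patched sources.

      PROGRAM TRUNCQR
      IMPLICIT NONE
      INTEGER            M, N, NRHS, LDA, KMAX, K, LWORK, INFO
      PARAMETER          ( M = 6, N = 4, NRHS = 1, LDA = M )
      INTEGER            JPIV( N ), IWORK( N-1 )
      REAL               ABSTOL, RELTOL, MAXC2NRMK, RELMAXC2NRMK
      REAL               RWORK( 2*N )
      COMPLEX            A( LDA, N+NRHS ), TAU( N ), WQ( 1 )
      COMPLEX, ALLOCATABLE :: WORK( : )
*     Rank-1 test data: the matrix A and the single right-hand side
*     stored in column N+1 are all ones.
      A = ( 1.0E0, 0.0E0 )
      KMAX = N
      ABSTOL = -1.0E0
      RELTOL = 1.0E-4
*     Workspace query (LWORK = -1); the size comes back in WQ(1).
      LWORK = -1
      CALL CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, K,
     $               MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, WQ, LWORK,
     $               RWORK, IWORK, INFO )
      LWORK = INT( WQ( 1 ) )
      ALLOCATE( WORK( LWORK ) )
*     Truncated factorization; for this rank-1 A the RELTOL test
*     should stop it after K = 1 column.
      CALL CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, K,
     $               MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, WORK, LWORK,
     $               RWORK, IWORK, INFO )
      END

ABSTOL and RELTOL are passed as variables rather than literals on purpose: as described above, the routine clamps them to 2*SAFMIN and EPS respectively.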
+*>
+*>          NOTE: The decision, whether to use unblocked BLAS 2
+*>          or blocked BLAS 3 code is based not only on the dimension
+*>          LWORK of the available workspace WORK, but also on the
+*>          matrix A dimension N via crossover point NX returned
+*>          by ILAENV. (For N less than NX, unblocked code should be
+*>          used.)
+*>
+*>          If LWORK = -1, then a workspace query is assumed;
+*>          the routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (2*N)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          Is a work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix in the blocked step auxiliary subroutine CLAQP3RK ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) INFO < 0: if INFO = -i, the i-th argument had an
+*>                       illegal value.
+*>          3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step K+1 ( when K columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             K                  is set to the number of
+*>                                factorized columns without
+*>                                exception.
+*>             MAXC2NRMK          is set to NaN.
+*>             RELMAXC2NRMK       is set to NaN.
+*>             TAU(K+1:min(M,N))  is not set and contains undefined
+*>                                elements. If j_1=K+1, TAU(K+1)
+*>                                may contain NaN.
+*>          4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup geqp3rk
+*
+*> \par Further Details:
+* =====================
+*
+*> \verbatim
+*> CGEQP3RK is based on the same BLAS3 Householder QR factorization
+*> algorithm with column pivoting as in CGEQP3 routine which uses
+*> CLARFG routine to generate Householder reflectors
+*> for QR factorization.
+*>
+*> We can also write:
+*>
+*>   A = A_approx(K) + A_residual(K)
+*>
+*> The low rank approximation matrix A(K)_approx from
+*> the truncated QR factorization of rank K of the matrix A is:
+*>
+*>   A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T
+*>                        (      0     0 )
+*>
+*>               = Q(K) * ( R11(K) R12(K) ) * P(K)**T
+*>                        (      0      0 )
+*>
+*> The residual A_residual(K) of the matrix A is:
+*>
+*>   A_residual(K) = Q(K) * ( 0  0             ) * P(K)**T =
+*>                          ( 0  R(K)_residual )
+*>
+*>                 = Q(K) * ( 0  0      ) * P(K)**T
+*>                          ( 0  R22(K) )
+*>
+*> The truncated (rank K) factorization guarantees that
+*> the maximum column 2-norm of A_residual(K) is less than
+*> or equal to MAXC2NRMK up to roundoff error.
+*>
+*> NOTE: An approximation of the null vectors
+*>       of A can be easily computed from R11(K)
+*>       and R12(K):
+*>
+*>       Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) )
+*>                                 (          -I          )
+*>
+*> \endverbatim
+*
+*> \par References:
+* ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G.
Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, + $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ WORK, LWORK, RWORK, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS + REAL ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + REAL RWORK( * ) + COMPLEX A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER INB, INBMIN, IXOVER + PARAMETER ( INB = 1, INBMIN = 2, IXOVER = 3 ) + REAL ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 ) + COMPLEX CZERO + PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, DONE + INTEGER IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX, + $ JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB, + $ NBMIN, NX + REAL EPS, HUGEVAL, MAXC2NRM, SAFMIN +* .. +* .. External Subroutines .. + EXTERNAL CLAQP2RK, CLAQP3RK, XERBLA +* .. +* .. External Functions .. + LOGICAL SISNAN + INTEGER ISAMAX, ILAENV + REAL SLAMCH, SCNRM2, SROUNDUP_LWORK + EXTERNAL SISNAN, SLAMCH, SCNRM2, ISAMAX, ILAENV, + $ SROUNDUP_LWORK +* .. +* .. Intrinsic Functions .. + INTRINSIC CMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test input arguments +* ==================== +* + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( KMAX.LT.0 ) THEN + INFO = -4 + ELSE IF( SISNAN( ABSTOL ) ) THEN + INFO = -5 + ELSE IF( SISNAN( RELTOL ) ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -8 + END IF +* +* If the input parameters M, N, NRHS, KMAX, LDA are valid: +* a) Test the input workspace size LWORK for the minimum +* size requirement IWS. +* b) Determine the optimal block size NB and optimal +* workspace size LWKOPT to be returned in WORK(1) +* in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., +* (3) when routine exits. 
+*     Here, IWS is the minimum workspace required for unblocked
+*     code.
+*
+      IF( INFO.EQ.0 ) THEN
+         MINMN = MIN( M, N )
+         IF( MINMN.EQ.0 ) THEN
+            IWS = 1
+            LWKOPT = 1
+         ELSE
+*
+*           Minimal workspace size in case of using only unblocked
+*           BLAS 2 code in CLAQP2RK.
+*           1) CLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in CLARF subroutine inside CLAQP2RK to apply an
+*              elementary reflector from the left.
+*           TOTAL_WORK_SIZE = 3*N + NRHS - 1
+*
+            IWS = N + NRHS - 1
+*
+*           Assign to NB optimal block size.
+*
+            NB = ILAENV( INB, 'CGEQP3RK', ' ', M, N, -1, -1 )
+*
+*           A formula for the optimal workspace size in case of using
+*           both unblocked BLAS 2 in CLAQP2RK and blocked BLAS 3 code
+*           in CLAQP3RK.
+*           1) CGEQP3RK, CLAQP2RK, CLAQP3RK: 2*N to store full and
+*              partial column 2-norms.
+*           2) CLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in CLARF subroutine to apply an elementary reflector
+*              from the left.
+*           3) CLAQP3RK: NB*(N+NRHS) to use in the work array F that
+*              is used to apply a block reflector from
+*              the left.
+*           4) CLAQP3RK: NB to use in the auxiliary array AUX.
+*           Sizes (2) and ((3) + (4)) should intersect, therefore
+*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2.
+*
+            LWKOPT = 2*N + NB*( N+NRHS+1 )
+         END IF
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
+         IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+*     NOTE: The optimal workspace size is returned in WORK(1), if
+*     the input parameters M, N, NRHS, KMAX, LDA are valid.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CGEQP3RK', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible for M=0 or N=0.
+*
+      IF( MINMN.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+*     Initialize column pivot array JPIV.
+*
+      DO J = 1, N
+         JPIV( J ) = J
+      END DO
+*
+*     ==================================================================
+*
+*     Initialize storage for partial and exact column 2-norms.
+*     a) The elements RWORK(1:N) are used to store partial column
+*        2-norms of the matrix A, and may decrease in each computation
+*        step; initialize to the values of complete columns 2-norms.
+*     b) The elements RWORK(N+1:2*N) are used to store complete column
+*        2-norms of the matrix A, they are not changed during the
+*        computation; initialize the values of complete columns 2-norms.
+*
+      DO J = 1, N
+         RWORK( J ) = SCNRM2( M, A( 1, J ), 1 )
+         RWORK( N+J ) = RWORK( J )
+      END DO
+*
+*     ==================================================================
+*
+*     Compute the pivot column index and the maximum column 2-norm
+*     for the whole original matrix stored in A(1:M,1:N).
+*
+      KP1 = ISAMAX( N, RWORK( 1 ), 1 )
+      MAXC2NRM = RWORK( KP1 )
+*
+*     ==================================================================
+*
+      IF( SISNAN( MAXC2NRM ) ) THEN
+*
+*        Check if the matrix A contains NaN, set INFO parameter
+*        to the column number where the first NaN is found and return
+*        from the routine.
+*
+         K = 0
+         INFO = KP1
+*
+*        Set MAXC2NRMK and RELMAXC2NRMK to NaN.
+*
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = MAXC2NRM
+*
+*        Array TAU is not set and contains undefined elements.
+*
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+*     ===================================================================
+*
+      IF( MAXC2NRM.EQ.ZERO ) THEN
+*
+*        Check if the matrix A is a zero matrix, set array TAU and
+*        return from the routine.
+*
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+*
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+*
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+*
+      END IF
+*
+*     ===================================================================
+*
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+      IF( MAXC2NRM.GT.HUGEVAL ) THEN
+*
+*        Check if the matrix A contains +Inf or -Inf, set INFO parameter
+*        to the column number, where the first +/-Inf is found plus N,
+*        and continue the computation.
+*
+         INFO = N + KP1
+*
+      END IF
+*
+*     ==================================================================
+*
+*     Quick return if possible for the case when the first
+*     stopping criterion is satisfied, i.e. KMAX = 0.
+*
+      IF( KMAX.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+      EPS = SLAMCH('Epsilon')
+*
+*     Adjust ABSTOL
+*
+      IF( ABSTOL.GE.ZERO ) THEN
+         SAFMIN = SLAMCH('Safe minimum')
+         ABSTOL = MAX( ABSTOL, TWO*SAFMIN )
+      END IF
+*
+*     Adjust RELTOL
+*
+      IF( RELTOL.GE.ZERO ) THEN
+         RELTOL = MAX( RELTOL, EPS )
+      END IF
+*
+*     ===================================================================
+*
+*     JMAX is the maximum index of the column to be factorized,
+*     which is also limited by the first stopping criterion KMAX.
+*
+      JMAX = MIN( KMAX, MINMN )
+*
+*     ===================================================================
+*
+*     Quick return if possible for the case when the second or third
+*     stopping criterion for the whole original matrix is satisfied,
+*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL
+*     (which is ONE <= RELTOL).
+*
+      IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN
+*
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+*
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+*
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*     Factorize columns
+*     ==================================================================
+*
+*     Determine the block size.
+*
+      NBMIN = 2
+      NX = 0
+*
+      IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN
+*
+*        Determine when to cross over from blocked to unblocked code.
+*        (for N less than NX, unblocked code should be used).
+*
+         NX = MAX( 0, ILAENV( IXOVER, 'CGEQP3RK', ' ', M, N, -1, -1 ) )
+*
+         IF( NX.LT.MINMN ) THEN
+*
+*           Determine if workspace is large enough for blocked code.
+*
+            IF( LWORK.LT.LWKOPT ) THEN
+*
+*              Not enough workspace to use optimal block size that
+*              is currently stored in NB.
+*              Reduce NB and determine the minimum value of NB.
+*
+               NB = ( LWORK-2*N ) / ( N+1 )
+               NBMIN = MAX( 2, ILAENV( INBMIN, 'CGEQP3RK', ' ', M, N,
+     $                 -1, -1 ) )
+*
+            END IF
+         END IF
+      END IF
+*
+*     ==================================================================
+*
+*     DONE is the boolean flag to represent the case when the
+*     factorization completed in the block factorization routine,
+*     before the end of the block.
+*
+      DONE = .FALSE.
+*
+*     J is the column index.
+*
+      J = 1
+*
+*     (1) Use blocked code initially.
+*
+*     JMAXB is the maximum column index of the block when the
+*     blocked code is used; it is also limited by the first stopping
+*     criterion KMAX.
+*
+      JMAXB = MIN( KMAX, MINMN - NX )
+*
+      IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN
+*
+*        Loop over the column blocks of the matrix A(1:M,1:JMAXB).
+*        Here:
+*        J is the column index of a column block;
+*        JB is the column block size to pass to block factorization
+*        routine in a loop step;
+*        JBF is the number of columns that were actually factorized,
+*        as returned by the block factorization routine
+*        in a loop step, JBF <= JB;
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         DO WHILE( J.LE.JMAXB )
+*
+            JB = MIN( NB, JMAXB-J+1 )
+            N_SUB = N-J+1
+            IOFFSET = J-1
+*
+*           Factorize JB columns among the columns A(J:N).
+*
+            CALL CLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                     DONE, JBF, MAXC2NRMK, RELMAXC2NRMK,
+     $                     JPIV( J ), TAU( J ),
+     $                     RWORK( J ), RWORK( N+J ),
+     $                     WORK( 1 ), WORK( JB+1 ),
+     $                     N+NRHS-J+1, IWORK, IINFO )
+*
+*           Set INFO on the first occurrence of Inf.
+*
+            IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+               INFO = 2*IOFFSET + IINFO
+            END IF
+*
+            IF( DONE ) THEN
+*
+*              Either the submatrix is zero before the end of the
+*              column block, or the ABSTOL or RELTOL criterion is
+*              satisfied before the end of the column block; in either
+*              case we can return from the routine. Perform the
+*              following before returning:
+*                 a) Set the number of factorized columns K,
+*                    K = IOFFSET + JBF from the last call of blocked
+*                    routine.
+*                 NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned
+*                          by the block factorization routine;
+*                       2) The remaining TAUs are set to ZERO by the
+*                          block factorization routine.
+*
+               K = IOFFSET + JBF
+*
+*              Set INFO on the first occurrence of NaN, NaN takes
+*              precedence over Inf.
+*
+               IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+                  INFO = IOFFSET + IINFO
+               END IF
+*
+*              Return from the routine.
+*
+               WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
+               RETURN
+*
+            END IF
+*
+            J = J + JBF
+*
+         END DO
+*
+      END IF
+*
+*     Use unblocked code to factor the last or only block.
+*     J = JMAX+1 means we factorized the maximum possible number of
+*     columns; that is, in the ELSE clause we need to compute
+*     the MAXC2NRMK and RELMAXC2NRMK to return after we processed
+*     the blocks.
+*
+      IF( J.LE.JMAX ) THEN
+*
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         N_SUB = N-J+1
+         IOFFSET = J-1
+*
+         CALL CLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1,
+     $                  ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                  KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ),
+     $                  TAU( J ), RWORK( J ), RWORK( N+J ),
+     $                  WORK( 1 ), IINFO )
+*
+*        The ABSTOL or RELTOL criterion is satisfied when the number of
+*        the factorized columns KF is smaller than the number
+*        of columns JMAX-J+1 supplied to be factorized by the
+*        unblocked routine; in that case we can return from
+*        the routine. Perform the following before returning:
+*           a) Set the number of factorized columns K,
+*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the
+*              unblocked factorization routine above.
+*
+         K = J - 1 + KF
+*
+*        Set INFO on the first exception occurrence of Inf or NaN
+*        (NaN takes precedence over Inf).
+*
+         IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+            INFO = 2*IOFFSET + IINFO
+         ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+            INFO = IOFFSET + IINFO
+         END IF
+*
+      ELSE
+*
+*        Compute the return values for blocked code.
+*
+*        Set the number of factorized columns if the unblocked routine
+*        was not called.
+*
+         K = JMAX
+*
+*        If there exists a residual matrix after the blocked code:
+*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the
+*              residual matrix, otherwise set them to ZERO;
+*           2) Set TAU(K+1:MINMN) to ZERO.
+* + IF( K.LT.MINMN ) THEN + JMAXC2NRM = K + ISAMAX( N-K, RWORK( K+1 ), 1 ) + MAXC2NRMK = RWORK( JMAXC2NRM ) + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + DO J = K + 1, MINMN + TAU( J ) = CZERO + END DO +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + END IF +* +* END IF( J.LE.JMAX ) THEN +* + END IF +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* + RETURN +* +* End of CGEQP3RK +* + END diff --git a/lapack-netlib/SRC/cgeqr.f b/lapack-netlib/SRC/cgeqr.f index d10e3da65..3617594d0 100644 --- a/lapack-netlib/SRC/cgeqr.f +++ b/lapack-netlib/SRC/cgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE CGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,11 +190,12 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLATSQR, CGEQRT, XERBLA @@ -244,8 +247,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +258,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +273,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +287,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +314,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) * RETURN * diff --git a/lapack-netlib/SRC/cgeqrfp.f b/lapack-netlib/SRC/cgeqrfp.f index eaf98ddf3..5b6226c67 100644 --- a/lapack-netlib/SRC/cgeqrfp.f +++ b/lapack-netlib/SRC/cgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. 
 EXTERNAL           CGEQR2P, CLARFB, CLARFT, XERBLA
@@ -182,8 +183,16 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'CGEQRF', ' ', M, N, -1, -1 )
-      LWKOPT = N*NB
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      K = MIN( M, N )
+      IF( K.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = N
+         LWKOPT = N*NB
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -191,7 +200,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
       IF( INFO.NE.0 ) THEN
@@ -203,7 +212,6 @@
 *
 *     Quick return if possible
 *
-      K = MIN( M, N )
       IF( K.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -211,7 +219,7 @@
 *
       NBMIN = 2
       NX = 0
-      IWS = N
+      IWS = LWKMIN
       IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
 *        Determine when to cross over from blocked to unblocked code.
@@ -273,7 +281,7 @@
      $   CALL CGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK,
      $                 IINFO )
 *
-      WORK( 1 ) = SROUNDUP_LWORK(IWS)
+      WORK( 1 ) = SROUNDUP_LWORK( IWS )
       RETURN
 *
 *     End of CGEQRFP
diff --git a/lapack-netlib/SRC/cgesvdx.f b/lapack-netlib/SRC/cgesvdx.f
index fbdb121ca..e1856a65f 100644
--- a/lapack-netlib/SRC/cgesvdx.f
+++ b/lapack-netlib/SRC/cgesvdx.f
@@ -208,7 +208,7 @@
 *> \param[out] WORK
 *> \verbatim
 *> WORK is COMPLEX array, dimension (MAX(1,LWORK))
-*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK;
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
@@ -261,7 +261,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEsing
+*> \ingroup gesvdx
 *
 * =====================================================================
       SUBROUTINE CGESVDX( JOBU, JOBVT, RANGE, M, N, A, LDA, VL, VU,
@@ -312,8 +312,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, CLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANGE
+      REAL               SLAMCH, CLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN, SQRT
@@ -448,7 +448,7 @@
          END IF
       END IF
       MAXWRK = MAX( MAXWRK, MINWRK )
-      WORK( 1 ) = CMPLX( REAL( MAXWRK ), ZERO )
+      WORK( 1 ) = SROUNDUP_LWORK( MAXWRK )
 *
       IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
          INFO = -19
@@ -464,7 +464,7 @@
 *
 *     Quick return if possible
 *
-      IF( M.EQ.0 .OR. N.EQ.0 ) THEN
+      IF( MINMN.EQ.0 ) THEN
          RETURN
       END IF
 *
@@ -846,7 +846,7 @@
 *
 *     Return optimal workspace in WORK(1)
 *
-      WORK( 1 ) = CMPLX( REAL( MAXWRK ), ZERO )
+      WORK( 1 ) = SROUNDUP_LWORK( MAXWRK )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgesvj.f b/lapack-netlib/SRC/cgesvj.f
index 149cf5e48..b9c8f1709 100644
--- a/lapack-netlib/SRC/cgesvj.f
+++ b/lapack-netlib/SRC/cgesvj.f
@@ -208,15 +208,17 @@
 *> \verbatim
 *> CWORK is COMPLEX array, dimension (max(1,LWORK))
 *> Used as workspace.
-*> If on entry LWORK = -1, then a workspace query is assumed and
-*> no computation is done; CWORK(1) is set to the minial (and optimal)
-*> length of CWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *> LWORK is INTEGER.
-*> Length of CWORK, LWORK >= M+N.
+*> Length of CWORK.
+*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M+N, otherwise.
+*>
+*> If on entry LWORK = -1, then a workspace query is assumed and
+*> no computation is done; CWORK(1) is set to the minimal (and optimal)
+*> length of CWORK.
 *> \endverbatim
 *>
 *> \param[in,out] RWORK
@@ -247,15 +249,17 @@
 *> RWORK(6) = the largest absolute value over all sines of the
 *> Jacobi rotation angles in the last sweep. It can be
 *> useful for a post festum analysis.
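The relocated text above documents the standard two-phase workspace query, a pattern that recurs throughout this patch: a first call with a negative length only reports sizes, a second call does the work. A hedged caller-side sketch for CGESVJ, with every caller variable hypothetical:

*     Illustrative sketch only (not part of the patch): query the
*     minimal CWORK/RWORK sizes with LWORK = -1 / LRWORK = -1, then
*     call again with real workspace.  All variables hypothetical.
      CALL CGESVJ( 'G', 'U', 'V', M, N, A, LDA, SVA, MV, V, LDV,
     $             CWORK, -1, RWORK, -1, INFO )
      LWORK = INT( CWORK( 1 ) )
      LRWORK = INT( RWORK( 1 ) )
*     ... allocate CWORK( LWORK ) and RWORK( LRWORK ), then:
      CALL CGESVJ( 'G', 'U', 'V', M, N, A, LDA, SVA, MV, V, LDV,
     $             CWORK, LWORK, RWORK, LRWORK, INFO )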
-*> If on entry LRWORK = -1, then a workspace query is assumed and
-*> no computation is done; RWORK(1) is set to the minial (and optimal)
-*> length of RWORK.
 *> \endverbatim
 *>
 *> \param[in] LRWORK
 *> \verbatim
 *> LRWORK is INTEGER
-*> Length of RWORK, LRWORK >= MAX(6,N).
+*> Length of RWORK.
+*> LRWORK >= 1, if MIN(M,N) = 0, and LRWORK >= MAX(6,N), otherwise.
+*>
+*> If on entry LRWORK = -1, then a workspace query is assumed and
+*> no computation is done; RWORK(1) is set to the minimal (and optimal)
+*> length of RWORK.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -276,7 +280,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup gesvj
 *
 *> \par Further Details:
 *  =====================
@@ -374,16 +378,17 @@
       PARAMETER  ( NSWEEP = 30 )
 *     ..
 *     .. Local Scalars ..
-      COMPLEX AAPQ, OMPQ
-      REAL    AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG,
-     $        BIGTHETA, CS, CTOL, EPSLN, MXAAPQ,
-     $        MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL,
-     $        SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL
-      INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1,
-     $        ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34,
-     $        N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND
-      LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK,
-     $        RSVEC, UCTOL, UPPER
+      COMPLEX AAPQ, OMPQ
+      REAL    AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG,
+     $        BIGTHETA, CS, CTOL, EPSLN, MXAAPQ,
+     $        MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL,
+     $        SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL
+      INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1,
+     $        ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34,
+     $        N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND,
+     $        MINMN, LWMIN, LRWMIN
+      LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK,
+     $        RSVEC, UCTOL, UPPER
 *     ..
 *     ..
 *     .. Intrinsic Functions ..
@@ -398,8 +403,8 @@
       INTEGER ISAMAX
       EXTERNAL ISAMAX
 *     from LAPACK
-      REAL    SLAMCH
-      EXTERNAL SLAMCH
+      REAL    SLAMCH, SROUNDUP_LWORK
+      EXTERNAL SLAMCH, SROUNDUP_LWORK
       LOGICAL LSAME
       EXTERNAL LSAME
 *     ..
@@ -422,7 +427,16 @@
       UPPER = LSAME( JOBA, 'U' )
       LOWER = LSAME( JOBA, 'L' )
 *
-      LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 )
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWMIN = 1
+         LRWMIN = 1
+      ELSE
+         LWMIN = M + N
+         LRWMIN = MAX( 6, N )
+      END IF
+*
+      LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 )
       IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN
          INFO = -1
       ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN
         INFO = -2
@@ -442,9 +456,9 @@
          INFO = -11
       ELSE IF( UCTOL .AND. ( RWORK( 1 ).LE.ONE ) ) THEN
          INFO = -12
-      ELSE IF( LWORK.LT.( M+N ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN
          INFO = -13
-      ELSE IF( LRWORK.LT.MAX( N, 6 ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LRWORK.LT.LRWMIN .AND. ( .NOT.LQUERY ) ) THEN
          INFO = -15
       ELSE
          INFO = 0
@@ -454,15 +468,15 @@
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'CGESVJ', -INFO )
          RETURN
-      ELSE IF ( LQUERY ) THEN
-         CWORK(1) = M + N
-         RWORK(1) = MAX( N, 6 )
+      ELSE IF( LQUERY ) THEN
+         CWORK( 1 ) = SROUNDUP_LWORK( LWMIN )
+         RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN )
          RETURN
       END IF
 *
 * #:) Quick return for void matrix
 *
-      IF( ( M.EQ.0 ) .OR.
( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/cgetri.f b/lapack-netlib/SRC/cgetri.f index 2060d1444..2eb3da7ab 100644 --- a/lapack-netlib/SRC/cgetri.f +++ b/lapack-netlib/SRC/cgetri.f @@ -153,8 +153,8 @@ * INFO = 0 NB = ILAENV( 1, 'CGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -252,7 +252,7 @@ $ CALL CSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) 60 CONTINUE * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGETRI diff --git a/lapack-netlib/SRC/cgetsls.f b/lapack-netlib/SRC/cgetsls.f index b4bb7562f..3f43dc8de 100644 --- a/lapack-netlib/SRC/cgetsls.f +++ b/lapack-netlib/SRC/cgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -229,7 +229,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEO = 1 + WSIZEM = 1 + ELSE IF ( M.GE.N ) THEN CALL CGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) diff --git a/lapack-netlib/SRC/cgetsqrhrt.f b/lapack-netlib/SRC/cgetsqrhrt.f index 4e4dc1d4a..087e9bc7f 100644 --- a/lapack-netlib/SRC/cgetsqrhrt.f +++ b/lapack-netlib/SRC/cgetsqrhrt.f @@ -131,13 +131,15 @@ *> \param[in] LWORK *> \verbatim *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -160,7 +162,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup comlpexOTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -200,6 +202,10 @@ INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS * .. +* .. External Functions .. + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL CCOPY, CLATSQR, CUNGTSQR_ROW, CUNHR_COL, $ XERBLA @@ -212,7 +218,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -225,7 +231,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -263,8 +269,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. 
.NOT.LQUERY ) THEN INFO = -11 END IF * @@ -277,14 +284,14 @@ CALL XERBLA( 'CGETSQRHRT', -INFO ) RETURN ELSE IF ( LQUERY ) THEN - WORK( 1 ) = CMPLX( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * * Quick return if possible * IF( MIN( M, N ).EQ.0 ) THEN - WORK( 1 ) = CMPLX( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * @@ -341,9 +348,9 @@ END IF END DO * - WORK( 1 ) = CMPLX( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN * * End of CGETSQRHRT * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/cgges3.f b/lapack-netlib/SRC/cgges3.f index aac9f9510..c1ca79688 100644 --- a/lapack-netlib/SRC/cgges3.f +++ b/lapack-netlib/SRC/cgges3.f @@ -215,7 +215,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -260,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE CGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -300,7 +301,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT + $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT, + $ LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SMLNUM * .. @@ -310,13 +312,12 @@ * .. * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHD3, CLAQZ0, CLACPY, - $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, SLABAD, - $ XERBLA + $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, XERBLA * .. * .. External Functions .. LOGICAL LSAME - REAL CLANGE, SLAMCH - EXTERNAL LSAME, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -353,6 +354,8 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -369,7 +372,7 @@ INFO = -14 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -16 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -18 END IF * @@ -377,29 +380,33 @@ * IF( INFO.EQ.0 ) THEN CALL CGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N + INT( WORK( 1 ) ) ) CALL CUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) IF( ILVSL ) THEN CALL CUNGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, $ IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) END IF CALL CGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL, $ LDVSL, VSR, LDVSR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) CALL CLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, WORK, -1, $ RWORK, 0, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) IF( WANTST ) THEN CALL CTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, SDIM, $ PVSL, PVSR, DIF, WORK, -1, IDUM, 1, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF - WORK( 1 ) = CMPLX( LWKOPT ) END IF * @@ -422,7 +429,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -585,7 +591,7 @@ * 30 CONTINUE * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cggev3.f b/lapack-netlib/SRC/cggev3.f index 9483ecdeb..d2b75aebc 100644 --- a/lapack-netlib/SRC/cggev3.f +++ b/lapack-netlib/SRC/cggev3.f @@ -174,7 +174,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -208,7 +209,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE CGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA, @@ -243,7 +244,7 @@ CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, $ IN, IRIGHT, IROWS, IRWRK, ITAU, IWRK, JC, JR, - $ LWKOPT + $ LWKOPT, LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP COMPLEX X @@ -253,13 +254,12 @@ * .. * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHD3, CLAQZ0, CLACPY, - $ CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, SLABAD, - $ XERBLA + $ CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, XERBLA * .. * .. External Functions .. LOGICAL LSAME - REAL CLANGE, SLAMCH - EXTERNAL LSAME, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, REAL, SQRT @@ -301,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -315,7 +316,7 @@ INFO = -11 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -15 END IF * @@ -323,7 +324,7 @@ * IF( INFO.EQ.0 ) THEN CALL CGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( N, N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N+INT( WORK( 1 ) ) ) CALL CUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) @@ -348,7 +349,11 @@ $ RWORK, 0, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) END IF - WORK( 1 ) = CMPLX( LWKOPT ) + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + END IF END IF * IF( INFO.NE.0 ) THEN @@ -368,7 +373,6 @@ EPS = SLAMCH( 'E' )*SLAMCH( 'B' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -549,7 +553,7 @@ IF( ILBSCL ) $ CALL CLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR ) * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CGGEV3 diff --git a/lapack-netlib/SRC/cgghd3.f b/lapack-netlib/SRC/cgghd3.f index 1074b4828..f7175a72c 100644 --- a/lapack-netlib/SRC/cgghd3.f +++ b/lapack-netlib/SRC/cgghd3.f @@ -180,14 +180,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -212,7 +212,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -265,7 +265,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGGHRD, CLARTG, CLASET, CUNM22, CROT, CGEMM, @@ -280,8 +281,13 @@ * INFO = 0 NB = ILAENV( 1, 'CGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) - WORK( 1 ) = CMPLX( LWKOPT ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) INITZ = LSAME( COMPZ, 'I' ) @@ -330,7 +336,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = CONE RETURN @@ -888,7 +893,8 @@ IF ( JCOL.LT.IHI ) $ CALL CGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) - WORK( 1 ) = CMPLX( LWKOPT ) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cggqrf.f b/lapack-netlib/SRC/cggqrf.f index 29b0bf4af..309f170e8 100644 --- a/lapack-netlib/SRC/cggqrf.f +++ b/lapack-netlib/SRC/cggqrf.f @@ -251,8 +251,8 @@ NB2 = ILAENV( 1, 'CGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'CUNMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P)*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -288,7 +288,7 @@ * RQ factorization of N-by-P matrix B: B = T*Z. 
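A recurring change in these hunks is routing integer workspace sizes through SROUNDUP_LWORK before they are stored into a REAL-typed WORK( 1 ). The apparent motivation: for sizes beyond the single-precision integer range, REAL( LWKOPT ) can round down, so INT( WORK( 1 ) ) at the caller would under-allocate. A minimal sketch of that guard, under the assumption that this is the behaviour SROUNDUP_LWORK provides; ROUNDUP_SKETCH is a hypothetical stand-in, not the library routine:

*     Illustrative sketch only: round a REAL-stored workspace size
*     up so that truncation back to INTEGER cannot lose anything.
      REAL FUNCTION ROUNDUP_SKETCH( LWORK )
      INTEGER            LWORK
      ROUNDUP_SKETCH = REAL( LWORK )
      IF( INT( ROUNDUP_SKETCH ).LT.LWORK ) THEN
*        REAL( LWORK ) rounded down; nudge it to the next
*        representable single-precision value.
         ROUNDUP_SKETCH = ROUNDUP_SKETCH *
     $                    ( 1.0E+0 + EPSILON( 1.0E+0 ) )
      END IF
      RETURN
      END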
* CALL CGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO ) - WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( MAX( LOPT, INT( WORK( 1 ) ) ) ) * RETURN * diff --git a/lapack-netlib/SRC/cggrqf.f b/lapack-netlib/SRC/cggrqf.f index 273ab3ef7..8470a1ce2 100644 --- a/lapack-netlib/SRC/cggrqf.f +++ b/lapack-netlib/SRC/cggrqf.f @@ -250,8 +250,8 @@ NB2 = ILAENV( 1, 'CGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'CUNMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P)*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -288,7 +288,7 @@ * QR factorization of P-by-N matrix B: B = Z*T * CALL CGEQRF( P, N, B, LDB, TAUB, WORK, LWORK, INFO ) - WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( MAX( LOPT, INT( WORK( 1 ) ) ) ) * RETURN * diff --git a/lapack-netlib/SRC/cggsvd3.f b/lapack-netlib/SRC/cggsvd3.f index f248aebd5..4c4b85bae 100644 --- a/lapack-netlib/SRC/cggsvd3.f +++ b/lapack-netlib/SRC/cggsvd3.f @@ -278,7 +278,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -333,7 +333,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/cggsvp3.f b/lapack-netlib/SRC/cggsvp3.f index 008a053a2..e19f7efd5 100644 --- a/lapack-netlib/SRC/cggsvp3.f +++ b/lapack-netlib/SRC/cggsvp3.f @@ -233,7 +233,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -256,7 +256,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/cheevd.f b/lapack-netlib/SRC/cheevd.f index b5ca804eb..9b62a2df6 100644 --- a/lapack-netlib/SRC/cheevd.f +++ b/lapack-netlib/SRC/cheevd.f @@ -116,8 +116,7 @@ *> *> \param[out] RWORK *> \verbatim -*> RWORK is REAL array, -*> dimension (LRWORK) +*> RWORK is REAL array, dimension (MAX(1,LRWORK)) *> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. *> \endverbatim *> @@ -282,8 +281,8 @@ LROPT = LRWMIN LIOPT = LIWMIN END IF - WORK( 1 ) = SROUNDUP_LWORK(LOPT) - RWORK( 1 ) = LROPT + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LROPT ) IWORK( 1 ) = LIOPT * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -378,8 +377,8 @@ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LOPT) - RWORK( 1 ) = LROPT + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LROPT ) IWORK( 1 ) = LIOPT * RETURN diff --git a/lapack-netlib/SRC/cheevr.f b/lapack-netlib/SRC/cheevr.f index 05c5e66be..ad5c8cd4a 100644 --- a/lapack-netlib/SRC/cheevr.f +++ b/lapack-netlib/SRC/cheevr.f @@ -272,7 +272,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,2*N). +*> The length of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 2*N. 
*> For optimal efficiency, LWORK >= (NB+1)*N, *> where NB is the max of the blocksize for CHETRD and for *> CUNMTR as returned by ILAENV. @@ -294,7 +295,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -313,7 +315,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -417,9 +420,15 @@ LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR. $ ( LIWORK.EQ.-1 ) ) * - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) - LWMIN = MAX( 1, 2*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 2*N + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -454,8 +463,8 @@ NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -483,7 +492,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = REAL( A( 1, 1 ) ) @@ -710,8 +719,8 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/cheevr_2stage.f b/lapack-netlib/SRC/cheevr_2stage.f index 0332a09bc..e06925fcd 100644 --- a/lapack-netlib/SRC/cheevr_2stage.f +++ b/lapack-netlib/SRC/cheevr_2stage.f @@ -265,7 +265,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of CSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the unitary transformations applied by CUNMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -279,12 +279,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -310,7 +311,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. 
*> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -329,7 +331,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -354,7 +357,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -382,7 +385,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -390,11 +393,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -443,8 +446,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV, ILAENV2STAGE - REAL SLAMCH, CLANSY - EXTERNAL LSAME, SLAMCH, CLANSY, ILAENV, ILAENV2STAGE + REAL SLAMCH, CLANSY, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, CLANSY, ILAENV, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL, @@ -472,9 +476,16 @@ IB = ILAENV2STAGE( 2, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = N + LHTRD + LWTRD - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = N + LHTRD + LWTRD + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -506,8 +517,8 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -535,7 +546,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = REAL( A( 1, 1 ) ) @@ -643,9 +654,9 @@ * * Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. * - CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), + CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), $ RWORK( INDRE ), WORK( INDTAU ), - $ WORK( INDHOUS ), LHTRD, + $ WORK( INDHOUS ), LHTRD, $ WORK( INDWK ), LLWORK, IINFO ) * * If all eigenvalues are desired @@ -666,7 +677,7 @@ CALL SCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 ) CALL SCOPY( N, RWORK( INDRD ), 1, RWORK( INDRDD ), 1 ) * - IF (ABSTOL .LE. TWO*N*EPS) THEN + IF ( ABSTOL .LE. TWO*N*EPS ) THEN TRYRAC = .TRUE. ELSE TRYRAC = .FALSE. 
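Since CHEEVR_2STAGE now computes LWMIN, LRWMIN and LIWMIN jointly (see the hunks above), a caller can retrieve all three with a single query call; note that only the REAL-typed outputs need the SROUNDUP_LWORK treatment, while IWORK( 1 ) is an exact INTEGER. A sketch, with every caller-side variable hypothetical:

*     Illustrative sketch only (not part of the patch): query all
*     three workspaces of CHEEVR_2STAGE at once, then allocate and
*     call again.  All caller variables are hypothetical.
      CALL CHEEVR_2STAGE( 'N', 'A', 'U', N, A, LDA, VL, VU, IL, IU,
     $                    ABSTOL, M, W, Z, LDZ, ISUPPZ, WORK, -1,
     $                    RWORK, -1, IWORK, -1, INFO )
      LWORK = INT( WORK( 1 ) )
      LRWORK = INT( RWORK( 1 ) )
      LIWORK = IWORK( 1 )
*     ... allocate WORK, RWORK and IWORK at these sizes, call again.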
@@ -765,8 +776,8 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/cheevx.f b/lapack-netlib/SRC/cheevx.f index e91599a44..a8a2bde63 100644 --- a/lapack-netlib/SRC/cheevx.f +++ b/lapack-netlib/SRC/cheevx.f @@ -348,14 +348,14 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWKMIN = 1 - WORK( 1 ) = LWKMIN + LWKOPT = 1 ELSE LWKMIN = 2*N NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) ) - LWKOPT = MAX( 1, ( NB + 1 )*N ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = ( NB + 1 )*N END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -17 diff --git a/lapack-netlib/SRC/chesv_aa.f b/lapack-netlib/SRC/chesv_aa.f index 53ecc0a16..0f41c9332 100644 --- a/lapack-netlib/SRC/chesv_aa.f +++ b/lapack-netlib/SRC/chesv_aa.f @@ -177,7 +177,7 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS + INTEGER LWKMIN, LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS * .. * .. External Functions .. LOGICAL LSAME @@ -197,6 +197,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -207,18 +208,18 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 2*N, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_HETRF = INT( WORK(1) ) + LWKOPT_HETRF = INT( WORK( 1 ) ) CALL CHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_HETRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT_HETRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_HETRF, LWKOPT_HETRS ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -240,7 +241,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chesv_aa_2stage.f b/lapack-netlib/SRC/chesv_aa_2stage.f index 12950c4af..05ebd9253 100644 --- a/lapack-netlib/SRC/chesv_aa_2stage.f +++ b/lapack-netlib/SRC/chesv_aa_2stage.f @@ -99,14 +99,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX array, dimension (LTB) +*> TB is COMPLEX array, dimension (MAX(1,LTB)). *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -146,14 +146,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX workspace of size LWORK +*> WORK is COMPLEX workspace of size (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -203,7 +204,7 @@ * * .. Local Scalars .. 
LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKMIN, LWKOPT * .. * .. External Functions .. LOGICAL LSAME @@ -225,6 +226,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -233,18 +235,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -254,7 +257,6 @@ RETURN END IF * -* * Compute the factorization A = U**H*T*U or A = L*T*L**H. * CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, @@ -268,7 +270,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chesvx.f b/lapack-netlib/SRC/chesvx.f index c23a35ce7..bdaad55ec 100644 --- a/lapack-netlib/SRC/chesvx.f +++ b/lapack-netlib/SRC/chesvx.f @@ -307,7 +307,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKMIN, LWKOPT, NB REAL ANORM * .. * .. External Functions .. @@ -329,6 +329,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -346,17 +347,17 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 2*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) END IF - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -405,7 +406,7 @@ IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) $ INFO = N + 1 * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chetrd_2stage.f b/lapack-netlib/SRC/chetrd_2stage.f index f5ad35f27..ec7075798 100644 --- a/lapack-netlib/SRC/chetrd_2stage.f +++ b/lapack-netlib/SRC/chetrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download CHETRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download CHETRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * COMPLEX A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. 
-* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. *> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the unitary *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is COMPLEX array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is COMPLEX array, dimension (LHOUS2) +*> HOUS2 is COMPLEX array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2=-1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -143,13 +145,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -157,9 +162,9 @@ *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +182,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +207,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. 
-*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +215,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -250,7 +255,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - EXTERNAL LSAME, ILAENV2STAGE + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV2STAGE, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -265,10 +271,13 @@ * KD = ILAENV2STAGE( 1, 'CHETRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'CHETRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'CHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -285,8 +294,8 @@ END IF * IF( INFO.EQ.0 ) THEN - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + HOUS2( 1 ) = SROUNDUP_LWORK( LHMIN ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -309,14 +318,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL CHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL CHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRD_HE2HB', -INFO ) RETURN END IF - CALL CHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL CHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRD_HB2ST', -INFO ) @@ -324,8 +333,7 @@ END IF * * - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CHETRD_2STAGE diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F index 3688e40a3..b0d3e45fb 100644 --- a/lapack-netlib/SRC/chetrd_hb2st.F +++ b/lapack-netlib/SRC/chetrd_hb2st.F @@ -132,15 +132,17 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is COMPLEX array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is COMPLEX array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. 
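The HOUS/LHOUS pair documented here follows the same query protocol as WORK/LWORK, and the two queries may be combined in one call. A sketch with hypothetical caller variables, using STAGE1 = 'N' (band matrix supplied directly) and VECT = 'N':

*     Illustrative sketch only: LHOUS = -1 and LWORK = -1 may be
*     passed together; the minimal sizes come back in HOUS( 1 )
*     and WORK( 1 ).  All caller variables are hypothetical.
      CALL CHETRD_HB2ST( 'N', 'N', 'U', N, KD, AB, LDAB, D, E,
     $                   HOUS, -1, WORK, -1, INFO )
      LHOUS = INT( HOUS( 1 ) )
      LWORK = INT( WORK( 1 ) )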
+*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error @@ -152,14 +154,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension LWORK. +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)). +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -262,7 +267,7 @@ INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, + $ NBTILES, TTYPE, TID, NTHREADS, $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SICEV, SIZETAU, LDV, LHMIN, LWMIN @@ -286,7 +291,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -295,9 +299,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -318,8 +327,8 @@ END IF * IF( INFO.EQ.0 ) THEN - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + HOUS( 1 ) = SROUNDUP_LWORK( LHMIN ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -575,8 +584,7 @@ C END IF 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CHETRD_HB2ST diff --git a/lapack-netlib/SRC/chetrd_he2hb.f b/lapack-netlib/SRC/chetrd_he2hb.f index 090f02100..42e71e0b2 100644 --- a/lapack-netlib/SRC/chetrd_he2hb.f +++ b/lapack-netlib/SRC/chetrd_he2hb.f @@ -123,8 +123,8 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY). 
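To make the two-stage workspace formula quoted earlier in this patch (dimension = N*KD + N*max(KD+1,FACTOPTNB) + max(2*KD*KD, KD*NTHREADS) + (KD+1)*N) concrete, here is one worked evaluation with purely illustrative numbers. For N = 1000, KD = 64, FACTOPTNB = 128 and NTHREADS = 8: 1000*64 + 1000*128 + max(8192, 512) + 65*1000 = 64000 + 128000 + 8192 + 65000 = 265192 workspace entries, dominated by the N*max(KD+1,FACTOPTNB) term.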
+*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -294,8 +296,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -314,7 +320,7 @@ CALL XERBLA( 'CHETRD_HE2HB', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN END IF * @@ -507,7 +513,7 @@ END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CHETRD_HE2HB diff --git a/lapack-netlib/SRC/chetrf.f b/lapack-netlib/SRC/chetrf.f index 0c596ffe7..2836e30bc 100644 --- a/lapack-netlib/SRC/chetrf.f +++ b/lapack-netlib/SRC/chetrf.f @@ -107,7 +107,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> \endverbatim *> @@ -228,8 +228,8 @@ * Determine the block size * NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -347,7 +347,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF diff --git a/lapack-netlib/SRC/chetrf_aa.f b/lapack-netlib/SRC/chetrf_aa.f index 0547a4eab..51410a6ed 100644 --- a/lapack-netlib/SRC/chetrf_aa.f +++ b/lapack-netlib/SRC/chetrf_aa.f @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= 2*N. For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -128,7 +130,7 @@ *> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -152,7 +154,7 @@ * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB COMPLEX ALPHA * .. @@ -179,19 +181,26 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.( 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -203,11 +212,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN A( 1, 1 ) = REAL( A( 1, 1 ) ) RETURN END IF @@ -460,7 +469,7 @@ END IF * 20 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF_AA diff --git a/lapack-netlib/SRC/chetrf_aa_2stage.f b/lapack-netlib/SRC/chetrf_aa_2stage.f index 400efdf26..a79343753 100644 --- a/lapack-netlib/SRC/chetrf_aa_2stage.f +++ b/lapack-netlib/SRC/chetrf_aa_2stage.f @@ -87,14 +87,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX array, dimension (LTB) +*> TB is COMPLEX array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX workspace of size LWORK +*> WORK is COMPLEX workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used +*> to select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -188,7 +188,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. @@ -213,9 +214,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -229,10 +230,10 @@ NB = ILAENV( 1, 'CHETRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = SROUNDUP_LWORK( MAX( 1, (3*NB+1)*N ) ) END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N*NB ) ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -241,7 +242,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * diff --git a/lapack-netlib/SRC/chetrf_rk.f b/lapack-netlib/SRC/chetrf_rk.f index ef442c937..a13c740e3 100644 --- a/lapack-netlib/SRC/chetrf_rk.f +++ b/lapack-netlib/SRC/chetrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension ( MAX(1,LWORK) ). +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. 
LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -311,8 +311,8 @@ * Determine the block size * NB = ILAENV( 1, 'CHETRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -488,7 +488,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF_RK diff --git a/lapack-netlib/SRC/chetrf_rook.f b/lapack-netlib/SRC/chetrf_rook.f index 1593c2edc..df0323520 100644 --- a/lapack-netlib/SRC/chetrf_rook.f +++ b/lapack-netlib/SRC/chetrf_rook.f @@ -122,7 +122,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -264,7 +264,7 @@ * NB = ILAENV( 1, 'CHETRF_ROOK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -387,7 +387,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF_ROOK diff --git a/lapack-netlib/SRC/chetri2.f b/lapack-netlib/SRC/chetri2.f index 2865a6440..f15065ae7 100644 --- a/lapack-netlib/SRC/chetri2.f +++ b/lapack-netlib/SRC/chetri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (N+NB+1)*(NB+3) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE CHETRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -147,7 +147,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHETRI2X, CHETRI, XERBLA @@ -159,9 +160,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +178,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = SROUNDUP_LWORK( MINSIZE ) RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - IF( NBMAX .GE. 
N ) THEN
+      IF( NBMAX.GE.N ) THEN
          CALL CHETRI( UPLO, N, A, LDA, IPIV, WORK, INFO )
       ELSE
          CALL CHETRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO )
       END IF
+*
       RETURN
*
*     End of CHETRI2
diff --git a/lapack-netlib/SRC/chetri_3.f b/lapack-netlib/SRC/chetri_3.f
index deda63598..ccfce5070 100644
--- a/lapack-netlib/SRC/chetri_3.f
+++ b/lapack-netlib/SRC/chetri_3.f
@@ -119,16 +119,17 @@
*>
*> \param[out] WORK
*> \verbatim
-*>          WORK is COMPLEX array, dimension (N+NB+1)*(NB+3).
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK)).
*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
*> \endverbatim
*>
*> \param[in] LWORK
*> \verbatim
*>          LWORK is INTEGER
-*>          The length of WORK. LWORK >= (N+NB+1)*(NB+3).
+*>          The length of WORK.
+*>          If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3).
*>
-*>          If LDWORK = -1, then a workspace query is assumed;
+*>          If LWORK = -1, then a workspace query is assumed;
*>          the routine only calculates the optimal size of the
*>          WORK array, returns this value as the first
*>          entry of the WORK array, and no error message related to
@@ -209,8 +210,13 @@
*
*     Determine the block size
*
-      NB = MAX( 1, ILAENV( 1, 'CHETRI_3', UPLO, N, -1, -1, -1 ) )
-      LWKOPT = ( N+NB+1 ) * ( NB+3 )
+      IF( N.EQ.0 ) THEN
+         LWKOPT = 1
+      ELSE
+         NB = MAX( 1, ILAENV( 1, 'CHETRI_3', UPLO, N, -1, -1, -1 ) )
+         LWKOPT = ( N+NB+1 ) * ( NB+3 )
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
*
      IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
         INFO = -1
@@ -218,7 +224,7 @@
         INFO = -2
      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
         INFO = -4
-      ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN
         INFO = -8
      END IF
*
@@ -226,7 +232,6 @@
         CALL XERBLA( 'CHETRI_3', -INFO )
         RETURN
      ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
         RETURN
      END IF
*
@@ -237,7 +242,7 @@
*
      CALL CHETRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO )
*
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
*
      RETURN
*
diff --git a/lapack-netlib/SRC/chetrs_aa.f b/lapack-netlib/SRC/chetrs_aa.f
index 879549106..07179ab92 100644
--- a/lapack-netlib/SRC/chetrs_aa.f
+++ b/lapack-netlib/SRC/chetrs_aa.f
@@ -105,7 +105,13 @@
*> \param[in] LWORK
*> \verbatim
*>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK >= max(1,3*N-2).
+*>          The dimension of the array WORK.
+*>          If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the minimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
*> \endverbatim
*>
*> \param[out] INFO
@@ -151,24 +157,30 @@
*     ..
*     .. Local Scalars ..
      LOGICAL            LQUERY, UPPER
-      INTEGER            K, KP, LWKOPT
+      INTEGER            K, KP, LWKMIN
*     ..
*     .. External Functions ..
      LOGICAL            LSAME
      REAL               SROUNDUP_LWORK
-      EXTERNAL           LSAME,SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
*     ..
*     .. External Subroutines ..
      EXTERNAL           CLACPY, CLACGV, CGTSV, CSWAP, CTRSM, XERBLA
*     ..
*     .. Intrinsic Functions ..
-      INTRINSIC          MAX
+      INTRINSIC          MIN, MAX
*     ..
*     .. Executable Statements ..
*
      INFO = 0
      UPPER = LSAME( UPLO, 'U' )
      LQUERY = ( LWORK.EQ.-1 )
+      IF( MIN( N, NRHS ).EQ.0 ) THEN
+         LWKMIN = 1
+      ELSE
+         LWKMIN = 3*N-2
+      END IF
+*
      IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
         INFO = -1
      ELSE IF( N.LT.0 ) THEN
@@ -179,21 +191,20 @@
         INFO = -5
      ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
         INFO = -8
-      ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKMIN ) RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/clamswlq.f b/lapack-netlib/SRC/clamswlq.f index 5daf60bf6..8f474a3ab 100644 --- a/lapack-netlib/SRC/clamswlq.f +++ b/lapack-netlib/SRC/clamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -193,91 +196,100 @@ *> * ===================================================================== SUBROUTINE CLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME REAL SROUNDUP_LWORK EXTERNAL LSAME, SROUNDUP_LWORK +* .. * .. External Subroutines .. - EXTERNAL CTPMLQT, CGEMLQT, XERBLA + EXTERNAL CTPMLQT, CGEMLQT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. 
MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLAMSWLQ', -INFO ) - WORK(1) = SROUNDUP_LWORK(LW) RETURN - ELSE IF (LQUERY) THEN - WORK(1) = SROUNDUP_LWORK(LW) + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * IF((NB.LE.K).OR.(NB.GE.MAX(M,N,K))) THEN CALL CGEMLQT( SIDE, TRANS, M, N, K, MB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN END IF * @@ -404,7 +416,7 @@ * END IF * - WORK(1) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLAMSWLQ diff --git a/lapack-netlib/SRC/clamtsqr.f b/lapack-netlib/SRC/clamtsqr.f index 05021e642..13625087f 100644 --- a/lapack-netlib/SRC/clamtsqr.f +++ b/lapack-netlib/SRC/clamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -195,45 +197,47 @@ *> * ===================================================================== SUBROUTINE CLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME REAL SROUNDUP_LWORK EXTERNAL LSAME, SROUNDUP_LWORK +* .. * .. External Subroutines .. - EXTERNAL CGEMQRT, CTPMQRT, XERBLA + EXTERNAL CGEMQRT, CTPMQRT, XERBLA * .. * .. Executable Statements .. 
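+*
+*     ( Illustrative sketch only, not part of the routine: the
+*       LWORK = -1 workspace query documented above is typically
+*       driven from the caller as
+*
+*          CALL CLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
+*         $               LDT, C, LDC, WQUERY, -1, INFO )
+*          LWREQ = INT( WQUERY( 1 ) )
+*
+*       after which the caller allocates WORK( LWREQ ) and repeats
+*       the call with LWORK = LWREQ. WQUERY ( a length-1 COMPLEX
+*       array ) and LWREQ are hypothetical caller-side names. )
+*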
* * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -241,11 +245,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -256,38 +266,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = SROUNDUP_LWORK(LW) + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL CGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -412,7 +422,7 @@ * END IF * - WORK(1) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLAMTSQR diff --git a/lapack-netlib/SRC/claqp2rk.c b/lapack-netlib/SRC/claqp2rk.c new file mode 100644 index 000000000..4184c5927 --- /dev/null +++ b/lapack-netlib/SRC/claqp2rk.c @@ -0,0 +1,943 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define 
TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? 
floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex 
pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + *info = *n + kk - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL >= ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL >= ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + +/* Set K, the number of factorized columns. */ + + *k = kk - 1; + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. */ + + i__2 = minmnfact; + for (j = kk; j <= i__2; ++j) { + i__3 = j; + tau[i__3].r = 0.f, tau[i__3].i = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,KK:N): */ +/* 1) swap the KK-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) copy the KK-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than KK in the next loop step.) */ +/* 3) Save the pivot interchange with the indices relative to the */ +/* the original matrix A, not the block A(1:M,1:N). 
*/ + + if (kp != kk) { + cswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1); + vn1[kp] = vn1[kk]; + vn2[kp] = vn2[kk]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[kk]; + jpiv[kk] = itemp; + } + +/* Generate elementary reflector H(KK) using the column A(I:M,KK), */ +/* if the column has more than one element, otherwise */ +/* the elementary reflector would be an identity matrix, */ +/* and TAU(KK) = CZERO. */ + + if (i__ < *m) { + i__2 = *m - i__ + 1; + clarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], & + c__1, &tau[kk]); + } else { + i__2 = kk; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + +/* Check if TAU(KK) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(KK) for Inf, */ +/* since CLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by CLARFG can contain Inf, which requires */ +/* TAU(KK) to contain NaN. Therefore, this case of generating Inf */ +/* by CLARFG is covered by checking TAU(KK) for NaN. */ + + i__2 = kk; + r__1 = tau[i__2].r; + if (sisnan_(&r__1)) { + i__2 = kk; + taunan = tau[i__2].r; + } else /* if(complicated condition) */ { + r__1 = r_imag(&tau[kk]); + if (sisnan_(&r__1)) { + taunan = r_imag(&tau[kk]); + } else { + taunan = 0.f; + } + } + + if (sisnan_(&taunan)) { + *k = kk - 1; + *info = kk; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = taunan; + *relmaxc2nrmk = taunan; + +/* Array TAU(KK:MINMNFACT) is not set and contains */ +/* undefined elements, except the first element TAU(KK) = NaN. */ + + return 0; + } + +/* Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. */ +/* ( If M >= N, then at KK = N there is no residual matrix, */ +/* i.e. no columns of A to update, only columns of B. */ +/* If M < N, then at KK = M-IOFFSET, I = M and we have a */ +/* one-row residual matrix in A and the elementary */ +/* reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update */ +/* is needed for the residual matrix in A and the */ +/* right-hand-side-matrix in B. */ +/* Therefore, we update only if */ +/* KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */ +/* condition is satisfied, not only KK < N+NRHS ) */ + + if (kk < minmnupdt) { + i__2 = i__ + kk * a_dim1; + aikk.r = a[i__2].r, aikk.i = a[i__2].i; + i__2 = i__ + kk * a_dim1; + a[i__2].r = 1.f, a[i__2].i = 0.f; + i__2 = *m - i__ + 1; + i__3 = *n + *nrhs - kk; + r_cnjg(&q__1, &tau[kk]); + clarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &q__1, + &a[i__ + (kk + 1) * a_dim1], lda, &work[1]); + i__2 = i__ + kk * a_dim1; + a[i__2].r = aikk.r, a[i__2].i = aikk.i; + } + + if (kk < minmnfact) { + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */ +/* when KK < f2cmin(M-IOFFSET, N). */ + + i__2 = *n; + for (j = kk + 1; j <= i__2; ++j) { + if (vn1[j] != 0.f) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + +/* Computing 2nd power */ + r__1 = c_abs(&a[i__ + j * a_dim1]) / vn1[j]; + temp = 1.f - r__1 * r__1; + temp = f2cmax(temp,0.f); +/* Computing 2nd power */ + r__1 = vn1[j] / vn2[j]; + temp2 = temp * (r__1 * r__1); + if (temp2 <= tol3z) { + +/* Compute the column 2-norm for the partial */ +/* column A(I+1:M,J) by explicitly computing it, */ +/* and store it in both partial 2-norm vector VN1 */ +/* and exact column 2-norm vector VN2. 
*/ + + i__3 = *m - i__; + vn1[j] = scnrm2_(&i__3, &a[i__ + 1 + j * a_dim1], & + c__1); + vn2[j] = vn1[j]; + + } else { + +/* Update the column 2-norm for the partial */ +/* column A(I+1:M,J) by removing one */ +/* element A(I,J) and store it in partial */ +/* 2-norm vector VN1. */ + + vn1[j] *= sqrt(temp); + + } + } + } + + } + +/* End factorization loop */ + + } + +/* If we reached this point, all colunms have been factorized, */ +/* i.e. no condition was triggered to exit the routine. */ +/* Set the number of factorized columns. */ + + *k = *kmax; + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */ +/* we return. */ + + if (*k < minmnfact) { + + i__1 = *n - *k; + jmaxc2nrm = *k + isamax_(&i__1, &vn1[*k + 1], &c__1); + *maxc2nrmk = vn1[jmaxc2nrm]; + + if (*k == 0) { + *relmaxc2nrmk = 1.f; + } else { + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + } + + } else { + *maxc2nrmk = 0.f; + *relmaxc2nrmk = 0.f; + } + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, set TAUs corresponding to the columns that were */ +/* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO. */ + + i__1 = minmnfact; + for (j = *k + 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + + return 0; + +/* End of CLAQP2RK */ + +} /* claqp2rk_ */ + diff --git a/lapack-netlib/SRC/claqp2rk.f b/lapack-netlib/SRC/claqp2rk.f new file mode 100644 index 000000000..6b1db085a --- /dev/null +++ b/lapack-netlib/SRC/claqp2rk.f @@ -0,0 +1,726 @@ +*> \brief \b CLAQP2RK computes truncated QR factorization with column pivoting of a complex matrix block using Level 2 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CLAQP2RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, +* $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, +* $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, +* $ INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS +* REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER JPIV( * ) +* REAL VN1( * ), VN2( * ) +* COMPLEX A( LDA, * ), TAU( * ), WORK( * ) +* $ +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLAQP2RK computes a truncated (rank K) or full rank Householder QR +*> factorization with column pivoting of the complex matrix +*> block A(IOFFSET+1:M,1:N) as +*> +*> A * P(K) = Q(K) * R(K). +*> +*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N) +*> is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides matrix block B +*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**H * B. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. 
+*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M-IOFFSET,N), then this stopping +*> criterion is not used, factorize columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL, cannot be NaN. +*> +*> The second factorization stopping criterion. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is REAL, cannot be NaN. +*> +*> The third factorization stopping criterion. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine CGEQP3RK. 1 <= KP1 <= N_orig_mat. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is REAL +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine CGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:K) below +*> the diagonal together with the array TAU represent +*> the unitary matrix Q(K) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:K) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS). 
+*> The left part A(IOFFSET+1:M,K+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(K)**H. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is REAL +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is REAL +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is REAL array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is REAL array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (N-1) +*> Used in CLARF subroutine to apply an elementary +*> reflector from the left. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp2rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. 
Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> https://doi.org/10.1137/S1064827595296732
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> https://doi.org/10.1145/1377612.1377616
+*> \endhtmlonly
+
+*> \par Contributors:
+* ==================
+*>
+*> \verbatim
+*>
+*>  November 2023, Igor Kozachenko, James Demmel,
+*>                 EECS Department,
+*>                 University of California, Berkeley, USA.
+*>
+*> \endverbatim
+
+*  =====================================================================
+      SUBROUTINE CLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+     $                     INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            JPIV( * )
+      REAL               VN1( * ), VN2( * )
+      COMPLEX            A( LDA, * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+      COMPLEX            CZERO, CONE
+      PARAMETER          ( CZERO = ( 0.0E+0, 0.0E+0 ),
+     $                     CONE  = ( 1.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT,
+     $                   MINMNUPDT
+      REAL               HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z
+      COMPLEX            AIKK
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CLARF, CLARFG, CSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, REAL, CONJG, IMAG, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            SISNAN
+      INTEGER            ISAMAX
+      REAL               SLAMCH, SCNRM2
+      EXTERNAL           SISNAN, SLAMCH, ISAMAX, SCNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT is the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+*     MINMNUPDT is the smallest dimension
+*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be updated, which
+*     contains the submatrices A(IOFFSET+1:M,1:N) and
+*     B(IOFFSET+1:M,1:NRHS) as column blocks.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      KMAX = MIN( KMAX, MINMNFACT )
+      TOL3Z = SQRT( SLAMCH( 'Epsilon' ) )
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+*     Compute the factorization, KK is the column loop index.
+*
+      DO KK = 1, KMAX
+*
+         I = IOFFSET + KK
+*
+         IF( I.EQ.1 ) THEN
+*
+*           ============================================================
+*
+*           We are at the first column of the original whole matrix A,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine. 
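+*
+*           ( For orientation: in all later steps the pivot is chosen
+*             below, in the ELSE branch, as the remaining column of
+*             largest partial 2-norm,
+*
+*                KP = ( KK-1 ) + ISAMAX( N-KK+1, VN1( KK ), 1 ),
+*
+*             and KP1 is that same rule evaluated once for the whole
+*             matrix by the caller CGEQP3RK. )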
+*
+         KP = KP1
+*
+*           ============================================================
+*
+      ELSE
+*
+*           ============================================================
+*
+*        Determine the pivot column in the KK-th step, i.e. the index
+*        of the column with the maximum 2-norm in the
+*        submatrix A(I:M,K:N).
+*
+         KP = ( KK-1 ) + ISAMAX( N-KK+1, VN1( KK ), 1 )
+*
+*        Determine the maximum column 2-norm and the relative maximum
+*        column 2-norm of the submatrix A(I:M,KK:N) in step KK.
+*        RELMAXC2NRMK will be computed later, after some condition
+*        checks on MAXC2NRMK.
+*
+         MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*        Check if the submatrix A(I:M,KK:N) contains NaN, and set
+*        INFO parameter to the column number, where the first NaN
+*        is found and return from the routine.
+*        We need to check the condition only if the
+*        column index (same as row index) of the original whole
+*        matrix is larger than 1, since the condition for whole
+*        original matrix is checked in the main routine.
+*
+         IF( SISNAN( MAXC2NRMK ) ) THEN
+*
+*           Set K, the number of factorized columns
+*           that are not zero.
+*
+            K = KK - 1
+            INFO = K + KP
+*
+*           Set RELMAXC2NRMK to NaN.
+*
+            RELMAXC2NRMK = MAXC2NRMK
+*
+*           Array TAU(K+1:MINMNFACT) is not set and contains
+*           undefined elements.
+*
+            RETURN
+         END IF
+*
+*           ============================================================
+*
+*        Quick return, if the submatrix A(I:M,KK:N) is
+*        a zero matrix.
+*        We need to check the condition only if the
+*        column index (same as row index) of the original whole
+*        matrix is larger than 1, since the condition for whole
+*        original matrix is checked in the main routine.
+*
+         IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+*           Set K, the number of factorized columns
+*           that are not zero.
+*
+            K = KK - 1
+            RELMAXC2NRMK = ZERO
+*
+*           Set TAUs corresponding to the columns that were not
+*           factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO.
+*
+            DO J = KK, MINMNFACT
+               TAU( J ) = CZERO
+            END DO
+*
+*           Return from the routine.
+*
+            RETURN
+*
+         END IF
+*
+*           ============================================================
+*
+*        Check if the submatrix A(I:M,KK:N) contains Inf,
+*        set INFO parameter to the column number, where
+*        the first Inf is found plus N, and continue
+*        the computation.
+*        We need to check the condition only if the
+*        column index (same as row index) of the original whole
+*        matrix is larger than 1, since the condition for whole
+*        original matrix is checked in the main routine.
+*
+         IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+            INFO = N + KK - 1 + KP
+         END IF
+*
+*           ============================================================
+*
+*        Test for the second and third stopping criteria.
+*        NOTE: There is no need to test for ABSTOL >= ZERO, since
+*        MAXC2NRMK is non-negative. Similarly, there is no need
+*        to test for RELTOL >= ZERO, since RELMAXC2NRMK is
+*        non-negative.
+*        We need to check the condition only if the
+*        column index (same as row index) of the original whole
+*        matrix is larger than 1, since the condition for whole
+*        original matrix is checked in the main routine.
+
+         RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+*
+         IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+*           Set K, the number of factorized columns.
+*
+            K = KK - 1
+*
+*           Set TAUs corresponding to the columns that were not
+*           factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO.
+*
+            DO J = KK, MINMNFACT
+               TAU( J ) = CZERO
+            END DO
+*
+*           Return from the routine. 
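+*
+*           ( At this tolerance-based exit the factorization rank is
+*             final: K = KK - 1, TAU( KK:MINMNFACT ) was zeroed above,
+*             MAXC2NRMK and RELMAXC2NRMK describe the residual that is
+*             being discarded, and INFO keeps any Inf warning set
+*             earlier in this step. )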
+* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,KK:N): +* 1) swap the KK-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) copy the KK-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than KK in the next loop step.) +* 3) Save the pivot interchange with the indices relative to the +* the original matrix A, not the block A(1:M,1:N). +* + IF( KP.NE.KK ) THEN + CALL CSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 ) + VN1( KP ) = VN1( KK ) + VN2( KP ) = VN2( KK ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( KK ) + JPIV( KK ) = ITEMP + END IF +* +* Generate elementary reflector H(KK) using the column A(I:M,KK), +* if the column has more than one element, otherwise +* the elementary reflector would be an identity matrix, +* and TAU(KK) = CZERO. +* + IF( I.LT.M ) THEN + CALL CLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1, + $ TAU( KK ) ) + ELSE + TAU( KK ) = CZERO + END IF +* +* Check if TAU(KK) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(KK) for Inf, +* since CLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by CLARFG can contain Inf, which requires +* TAU(KK) to contain NaN. Therefore, this case of generating Inf +* by CLARFG is covered by checking TAU(KK) for NaN. +* + IF( SISNAN( REAL( TAU(KK) ) ) ) THEN + TAUNAN = REAL( TAU(KK) ) + ELSE IF( SISNAN( IMAG( TAU(KK) ) ) ) THEN + TAUNAN = IMAG( TAU(KK) ) + ELSE + TAUNAN = ZERO + END IF +* + IF( SISNAN( TAUNAN ) ) THEN + K = KK - 1 + INFO = KK +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAUNAN + RELMAXC2NRMK = TAUNAN +* +* Array TAU(KK:MINMNFACT) is not set and contains +* undefined elements, except the first element TAU(KK) = NaN. +* + RETURN + END IF +* +* Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. +* ( If M >= N, then at KK = N there is no residual matrix, +* i.e. no columns of A to update, only columns of B. +* If M < N, then at KK = M-IOFFSET, I = M and we have a +* one-row residual matrix in A and the elementary +* reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update +* is needed for the residual matrix in A and the +* right-hand-side-matrix in B. +* Therefore, we update only if +* KK < MINMNUPDT = min(M-IOFFSET, N+NRHS) +* condition is satisfied, not only KK < N+NRHS ) +* + IF( KK.LT.MINMNUPDT ) THEN + AIKK = A( I, KK ) + A( I, KK ) = CONE + CALL CLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1, + $ CONJG( TAU( KK ) ), A( I, KK+1 ), LDA, + $ WORK( 1 ) ) + A( I, KK ) = AIKK + END IF +* + IF( KK.LT.MINMNFACT ) THEN +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. +* when KK < min(M-IOFFSET, N). +* + DO J = KK + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2 + TEMP = MAX( TEMP, ZERO ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2 .LE. 
TOL3Z ) THEN +* +* Compute the column 2-norm for the partial +* column A(I+1:M,J) by explicitly computing it, +* and store it in both partial 2-norm vector VN1 +* and exact column 2-norm vector VN2. +* + VN1( J ) = SCNRM2( M-I, A( I+1, J ), 1 ) + VN2( J ) = VN1( J ) +* + ELSE +* +* Update the column 2-norm for the partial +* column A(I+1:M,J) by removing one +* element A(I,J) and store it in partial +* 2-norm vector VN1. +* + VN1( J ) = VN1( J )*SQRT( TEMP ) +* + END IF + END IF + END DO +* + END IF +* +* End factorization loop +* + END DO +* +* If we reached this point, all colunms have been factorized, +* i.e. no condition was triggered to exit the routine. +* Set the number of factorized columns. +* + K = KMAX +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before +* we return. +* + IF( K.LT.MINMNFACT ) THEN +* + JMAXC2NRM = K + ISAMAX( N-K, VN1( K+1 ), 1 ) + MAXC2NRMK = VN1( JMAXC2NRM ) +* + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + END IF +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, set TAUs corresponding to the columns that were +* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO. +* + DO J = K + 1, MINMNFACT + TAU( J ) = CZERO + END DO +* + RETURN +* +* End of CLAQP2RK +* + END diff --git a/lapack-netlib/SRC/claqp3rk.c b/lapack-netlib/SRC/claqp3rk.c new file mode 100644 index 000000000..ca305fab7 --- /dev/null +++ b/lapack-netlib/SRC/claqp3rk.c @@ -0,0 +1,1152 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ 
+typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? 
((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static 
_Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, + kb, &q__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* Quick return, if the submatrix A(I:M,K:N) is */ +/* a zero matrix. We need to check it only if the column index */ +/* (same as row index) is larger than 1, since the condition */ +/* for the whole original matrix A_orig is checked in the main */ +/* routine. */ + + if (*maxc2nrmk == 0.f) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *relmaxc2nrmk = 0.f; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix is zero and we stop the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, + kb, &q__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. 
*/ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* Check if the submatrix A(I:M,K:N) contains Inf, */ +/* set INFO parameter to the column number, where */ +/* the first Inf is found plus N, and continue */ +/* the computation. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + if (*info == 0 && *maxc2nrmk > myhugeval) { + *info = *n + k - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third tolerance stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL.GE.ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig; */ + + *kb = k - 1; + if__ = i__ - 1; + +/* Apply the block reflector to the residual of the */ +/* matrix A and the residual of the right hand sides B, if */ +/* the residual matrix and and/or the residual of the right */ +/* hand sides exist, i.e. if the submatrix */ +/* A(I+1:M,KB+1:N+NRHS) exists. This occurs when */ +/* KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, & + i__2, kb, &q__1, &a[if__ + 1 + a_dim1], lda, &f[* + kb + 1 + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*kb + + 1) * a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,K:N): */ +/* 1) swap the K-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */ +/* 3) copy the K-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. 
(Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than K in the next loop step.) */ +/* 4) Save the pivot interchange with the indices relative to the */ +/* the original matrix A_orig, not the block A(1:M,1:N). */ + + if (kp != k) { + cswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + cswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf); + vn1[kp] = vn1[k]; + vn2[kp] = vn2[k]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[k]; + jpiv[k] = itemp; + } + +/* Apply previous Householder reflectors to column K: */ +/* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. */ + + if (k > 1) { + i__1 = k - 1; + for (j = 1; j <= i__1; ++j) { + i__2 = k + j * f_dim1; + r_cnjg(&q__1, &f[k + j * f_dim1]); + f[i__2].r = q__1.r, f[i__2].i = q__1.i; + } + i__1 = *m - i__ + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = 0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[i__ + a_dim1], lda, + &f[k + f_dim1], ldf, &c_b2, &a[i__ + k * a_dim1], &c__1); + i__1 = k - 1; + for (j = 1; j <= i__1; ++j) { + i__2 = k + j * f_dim1; + r_cnjg(&q__1, &f[k + j * f_dim1]); + f[i__2].r = q__1.r, f[i__2].i = q__1.i; + } + } + +/* Generate elementary reflector H(k) using the column A(I:M,K). */ + + if (i__ < *m) { + i__1 = *m - i__ + 1; + clarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], & + c__1, &tau[k]); + } else { + i__1 = k; + tau[i__1].r = 0.f, tau[i__1].i = 0.f; + } + +/* Check if TAU(K) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(K) for Inf, */ +/* since CLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by CLARFG can contain Inf, which requires */ +/* TAU(K) to contain NaN. Therefore, this case of generating Inf */ +/* by CLARFG is covered by checking TAU(K) for NaN. */ + + i__1 = k; + r__1 = tau[i__1].r; + if (sisnan_(&r__1)) { + i__1 = k; + taunan = tau[i__1].r; + } else /* if(complicated condition) */ { + r__1 = r_imag(&tau[k]); + if (sisnan_(&r__1)) { + taunan = r_imag(&tau[k]); + } else { + taunan = 0.f; + } + } + + if (sisnan_(&taunan)) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *info = k; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = taunan; + *relmaxc2nrmk = taunan; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix contains NaN and we stop */ +/* the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. 
*/ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, kb, + &q__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * a_dim1], + lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* =============================================================== */ + + i__1 = i__ + k * a_dim1; + aik.r = a[i__1].r, aik.i = a[i__1].i; + i__1 = i__ + k * a_dim1; + a[i__1].r = 1.f, a[i__1].i = 0.f; + +/* =============================================================== */ + +/* Compute the current K-th column of F: */ +/* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). */ + + if (k < *n + *nrhs) { + i__1 = *m - i__ + 1; + i__2 = *n + *nrhs - k; + cgemv_("Conjugate transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + + 1) * a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, & + f[k + 1 + k * f_dim1], &c__1); + } + +/* 2) Zero out elements above and on the diagonal of the */ +/* column K in matrix F, i.e elements F(1:K,K). */ + + i__1 = k; + for (j = 1; j <= i__1; ++j) { + i__2 = j + k * f_dim1; + f[i__2].r = 0.f, f[i__2].i = 0.f; + } + +/* 3) Incremental updating of the K-th column of F: */ +/* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H */ +/* * A(I:M,K). */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + i__3 = k; + q__1.r = -tau[i__3].r, q__1.i = -tau[i__3].i; + cgemv_("Conjugate Transpose", &i__1, &i__2, &q__1, &a[i__ + + a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, &auxv[1] + , &c__1); + + i__1 = *n + *nrhs; + i__2 = k - 1; + cgemv_("No transpose", &i__1, &i__2, &c_b2, &f[f_dim1 + 1], ldf, & + auxv[1], &c__1, &c_b2, &f[k * f_dim1 + 1], &c__1); + } + +/* =============================================================== */ + +/* Update the current I-th row of A: */ +/* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */ +/* - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. */ + + if (k < *n + *nrhs) { + i__1 = *n + *nrhs - k; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &c__1, &i__1, &k, & + q__1, &a[i__ + a_dim1], lda, &f[k + 1 + f_dim1], ldf, & + c_b2, &a[i__ + (k + 1) * a_dim1], lda); + } + + i__1 = i__ + k * a_dim1; + a[i__1].r = aik.r, a[i__1].i = aik.i; + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */ +/* when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */ + + if (k < minmnfact) { + + i__1 = *n; + for (j = k + 1; j <= i__1; ++j) { + if (vn1[j] != 0.f) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + + temp = c_abs(&a[i__ + j * a_dim1]) / vn1[j]; +/* Computing MAX */ + r__1 = 0.f, r__2 = (temp + 1.f) * (1.f - temp); + temp = f2cmax(r__1,r__2); +/* Computing 2nd power */ + r__1 = vn1[j] / vn2[j]; + temp2 = temp * (r__1 * r__1); + if (temp2 <= tol3z) { + +/* At J-index, we have a difficult column for the */ +/* update of the 2-norm. Save the index of the previous */ +/* difficult column in IWORK(J-1). */ +/* NOTE: ILSTCC > 1, threfore we can use IWORK only */ +/* with N-1 elements, where the elements are */ +/* shifted by 1 to the left. */ + + iwork[j - 1] = lsticc; + +/* Set the index of the last difficult column LSTICC. 
*/ + + lsticc = j; + + } else { + vn1[j] *= sqrt(temp); + } + } + } + + } + +/* End of while loop. */ + + } + +/* Now, afler the loop: */ +/* Set KB, the number of factorized columns in the block; */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig, IF = IOFFSET + KB. */ + + *kb = k; + if__ = i__; + +/* Apply the block reflector to the residual of the matrix A */ +/* and the residual of the right hand sides B, if the residual */ +/* matrix and and/or the residual of the right hand sides */ +/* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */ +/* This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, &i__2, kb, &q__1, + &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b2, + &a[if__ + 1 + (*kb + 1) * a_dim1], lda); + } + +/* Recompute the 2-norm of the difficult columns. */ +/* Loop over the index of the difficult columns from the largest */ +/* to the smallest index. */ + + while(lsticc > 0) { + +/* LSTICC is the index of the last difficult column is greater */ +/* than 1. */ +/* ITEMP is the index of the previous difficult column. */ + + itemp = iwork[lsticc - 1]; + +/* Compute the 2-norm explicilty for the last difficult column and */ +/* save it in the partial and exact 2-norm vectors VN1 and VN2. */ + +/* NOTE: The computation of VN1( LSTICC ) relies on the fact that */ +/* SCNRM2 does not fail on vectors with norm below the value of */ +/* SQRT(SLAMCH('S')) */ + + i__1 = *m - if__; + vn1[lsticc] = scnrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1); + vn2[lsticc] = vn1[lsticc]; + +/* Downdate the index of the last difficult column to */ +/* the index of the previous difficult column. */ + + lsticc = itemp; + + } + + return 0; + +/* End of CLAQP3RK */ + +} /* claqp3rk_ */ + diff --git a/lapack-netlib/SRC/claqp3rk.f b/lapack-netlib/SRC/claqp3rk.f new file mode 100644 index 000000000..3703bcbd6 --- /dev/null +++ b/lapack-netlib/SRC/claqp3rk.f @@ -0,0 +1,947 @@ +*> \brief \b CLAQP3RK computes a step of truncated QR factorization with column pivoting of a complex m-by-n matrix A using Level 3 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CLAQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, +* $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, +* $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) +* IMPLICIT NONE +* LOGICAL DONE +* INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, +* $ NB, NRHS +* REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* REAL VN1( * ), VN2( * ) +* COMPLEX*16 A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLAQP3RK computes a step of truncated QR factorization with column +*> pivoting of a complex M-by-N matrix A block A(IOFFSET+1:M,1:N) +*> by using Level 3 BLAS as +*> +*> A * P(KB) = Q(KB) * R(KB). 
+*>
+*> The routine tries to factorize NB columns from A starting from
+*> the row IOFFSET+1 and updates the residual matrix with BLAS 3
+*> xGEMM. The number of actually factorized columns is returned
+*> in KB and may be smaller than NB.
+*>
+*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides B matrix stored
+*> in A(IOFFSET+1:M,N+1:N+NRHS) with Q(KB)**H * B.
+*>
+*> Cases when the number of factorized columns KB < NB:
+*>
+*> (1) In some cases, due to catastrophic cancellations, it cannot
+*> factorize all NB columns and needs to update the residual matrix.
+*> Hence, the actual number of factorized columns in the block returned
+*> in KB is smaller than NB. The logical DONE is returned as FALSE.
+*> The factorization of the whole original matrix A_orig must proceed
+*> with the next block.
+*>
+*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB.
+*>
+*> (3) In case neither stopping criterion ABSTOL nor RELTOL is used,
+*> and when the residual matrix is a zero matrix in some factorization
+*> step KB, the factorization of the whole original matrix A_orig is
+*> stopped, the logical DONE is returned as TRUE. The number of
+*> factorized columns which is smaller than NB is returned in KB.
+*>
+*> (4) Whenever NaN is detected in the matrix A or in the array TAU,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB. The INFO
+*> parameter is set to the column index of the first NaN occurrence.
+*>
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          Factorization block size, i.e. the number of columns
+*>          to factorize in the matrix A. 0 <= NB
+*>
+*>          If NB = 0, then the routine exits immediately.
+*>          This means that the factorization is not performed,
+*>          the matrices A and B and the arrays TAU, JPIV
+*>          are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is REAL, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          b) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine CGEQP3RK. 1 <= KP1 <= N_orig.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is REAL
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine CGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>          the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                               N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:KB) below
+*>             the diagonal together with the array TAU represent
+*>             the unitary matrix Q(KB) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:KB) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,KB+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(KB)**H.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] DONE
+*> \verbatim
+*>          DONE is LOGICAL
+*>          TRUE: a) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to ABSTOL
+*>                   or RELTOL criterion,
+*>                b) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to the
+*>                   residual matrix being a ZERO matrix.
+*>                c) when NaN was detected in the matrix A
+*>                   or in the array TAU.
+*>          FALSE: otherwise.
+*> \endverbatim
+*>
+*> \param[out] KB
+*> \verbatim
+*>          KB is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= KB <= min(M-IOFFSET,NB,N).
+*>
+*>          KB also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is REAL
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank KB. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is REAL
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank KB) to the maximum column 2-norm of the
+*>          original matrix A_orig. RELMAXC2NRMK >= 0.
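+*>
+*> (Editorial illustration, not part of the original documentation:
+*> RELMAXC2NRMK is the quantity tested against RELTOL. For example,
+*> if MAXC2NRM = 4.0E+0 and the maximum residual column 2-norm has
+*> dropped to MAXC2NRMK = 2.0E-6, then RELMAXC2NRMK = 5.0E-7, and the
+*> factorization stops as soon as RELTOL >= 5.0E-7, or, independently,
+*> as soon as MAXC2NRMK <= ABSTOL.)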
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is REAL array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is REAL array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] AUXV
+*> \verbatim
+*>          AUXV is COMPLEX array, dimension (NB)
+*>          Auxiliary vector.
+*> \endverbatim
+*>
+*> \param[out] F
+*> \verbatim
+*>          F is COMPLEX array, dimension (LDF,NB)
+*>          Matrix F**H = L*(Y**H)*A.
+*> \endverbatim
+*>
+*> \param[in] LDF
+*> \verbatim
+*>          LDF is INTEGER
+*>          The leading dimension of the array F. LDF >= max(1,N+NRHS).
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          A work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step KB+1 ( when KB columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             KB                  is set to the number of
+*>                                    factorized columns without
+*>                                    exception.
+*>             MAXC2NRMK           is set to NaN.
+*>             RELMAXC2NRMK        is set to NaN.
+*>             TAU(KB+1:min(M,N))  is not set and contains undefined
+*>                                    elements. If j_1=KB+1, TAU(KB+1)
+*>                                    may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step KB+1 ( when KB columns have been factorized ).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp3rk
+*
+*> \par References:
+* ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> https://doi.org/10.1137/S1064827595296732
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> https://doi.org/10.1145/1377612.1377616
+*> \endhtmlonly
+*
+*> \par Contributors:
+* ==================
+*>
+*> \verbatim
+*>
+*>  November 2023, Igor Kozachenko, James Demmel,
+*>                 EECS Department,
+*>                 University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+* =====================================================================
+      SUBROUTINE CLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      LOGICAL            DONE
+      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+     $                   NB, NRHS
+      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      REAL               VN1( * ), VN2( * )
+      COMPLEX            A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+      COMPLEX            CZERO, CONE
+      PARAMETER          ( CZERO = ( 0.0E+0, 0.0E+0 ),
+     $                     CONE = ( 1.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            ITEMP, J, K, MINMNFACT, MINMNUPDT,
+     $                   LSTICC, KP, I, IF
+      REAL               HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z
+      COMPLEX            AIK
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CGEMM, CGEMV, CLARFG, CSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, REAL, CONJG, IMAG, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            SISNAN
+      INTEGER            ISAMAX
+      REAL               SLAMCH, SCNRM2
+      EXTERNAL           SISNAN, SLAMCH, ISAMAX, SCNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT is the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      NB = MIN( NB, MINMNFACT )
+      TOL3Z = SQRT( SLAMCH( 'Epsilon' ) )
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+*     Compute factorization in a while loop over NB columns,
+*     K is the column index in the block A(1:M,1:N).
+*
+      K = 0
+      LSTICC = 0
+      DONE = .FALSE.
+*
+      DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 )
+         K = K + 1
+         I = IOFFSET + K
+*
+         IF( I.EQ.1 ) THEN
+*
+*           We are at the first column of the original whole matrix A_orig,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+            KP = KP1
+*
+         ELSE
+*
+*           Determine the pivot column in K-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,K:N).
+*
+            KP = ( K-1 ) + ISAMAX( N-K+1, VN1( K ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,K:N) in step K.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains NaN, set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( SISNAN( MAXC2NRMK ) ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*              that are non-zero in each step in the block,
+*              i.e. the rank of the factor R.
+* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = KB + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. + + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* Quick return, if the submatrix A(I:M,K:N) is +* a zero matrix. We need to check it only if the column index +* (same as row index) is larger than 1, since the condition +* for the whole original matrix A_orig is checked in the main +* routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + RELMAXC2NRMK = ZERO +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix is zero and we stop the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. +* + DO J = K, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. 
MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + K - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third tolerance stopping criteria. +* NOTE: There is no need to test for ABSTOL.GE.ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig; +* + KB = K - 1 + IF = I - 1 +* +* Apply the block reflector to the residual of the +* matrix A and the residual of the right hand sides B, if +* the residual matrix and and/or the residual of the right +* hand sides exist, i.e. if the submatrix +* A(I+1:M,KB+1:N+NRHS) exists. This occurs when +* KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, N+NRHS-KB, KB,-CONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. +* + DO J = K, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,K:N): +* 1) swap the K-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) +* 3) copy the K-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. (Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than K in the next loop step.) +* 4) Save the pivot interchange with the indices relative to the +* the original matrix A_orig, not the block A(1:M,1:N). +* + IF( KP.NE.K ) THEN + CALL CSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 ) + CALL CSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF ) + VN1( KP ) = VN1( K ) + VN2( KP ) = VN2( K ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( K ) + JPIV( K ) = ITEMP + END IF +* +* Apply previous Householder reflectors to column K: +* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. +* + IF( K.GT.1 ) THEN + DO J = 1, K - 1 + F( K, J ) = CONJG( F( K, J ) ) + END DO + CALL CGEMV( 'No transpose', M-I+1, K-1, -CONE, A( I, 1 ), + $ LDA, F( K, 1 ), LDF, CONE, A( I, K ), 1 ) + DO J = 1, K - 1 + F( K, J ) = CONJG( F( K, J ) ) + END DO + END IF +* +* Generate elementary reflector H(k) using the column A(I:M,K). 
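+*
+*     ( Editorial note: CLARFG represents the reflector as
+*     H( K ) = I - TAU( K )*V*V**H with implicit V( 1 ) = 1, chosen
+*     so that H( K )**H * A(I:M,K) = ( BETA, 0, ..., 0 )**T with BETA
+*     real; on exit A( I, K ) holds BETA and A(I+1:M,K) holds the
+*     rest of V. )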
+* + IF( I.LT.M ) THEN + CALL CLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) ) + ELSE + TAU( K ) = CZERO + END IF +* +* Check if TAU(K) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(K) for Inf, +* since CLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by CLARFG can contain Inf, which requires +* TAU(K) to contain NaN. Therefore, this case of generating Inf +* by CLARFG is covered by checking TAU(K) for NaN. +* + IF( SISNAN( REAL( TAU(K) ) ) ) THEN + TAUNAN = REAL( TAU(K) ) + ELSE IF( SISNAN( IMAG( TAU(K) ) ) ) THEN + TAUNAN = IMAG( TAU(K) ) + ELSE + TAUNAN = ZERO + END IF +* + IF( SISNAN( TAUNAN ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = K +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAUNAN + RELMAXC2NRMK = TAUNAN +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* =============================================================== +* + AIK = A( I, K ) + A( I, K ) = CONE +* +* =============================================================== +* +* Compute the current K-th column of F: +* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). +* + IF( K.LT.N+NRHS ) THEN + CALL CGEMV( 'Conjugate transpose', M-I+1, N+NRHS-K, + $ TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1, + $ CZERO, F( K+1, K ), 1 ) + END IF +* +* 2) Zero out elements above and on the diagonal of the +* column K in matrix F, i.e elements F(1:K,K). +* + DO J = 1, K + F( J, K ) = CZERO + END DO +* +* 3) Incremental updating of the K-th column of F: +* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H +* * A(I:M,K). +* + IF( K.GT.1 ) THEN + CALL CGEMV( 'Conjugate Transpose', M-I+1, K-1, -TAU( K ), + $ A( I, 1 ), LDA, A( I, K ), 1, CZERO, + $ AUXV( 1 ), 1 ) +* + CALL CGEMV( 'No transpose', N+NRHS, K-1, CONE, + $ F( 1, 1 ), LDF, AUXV( 1 ), 1, CONE, + $ F( 1, K ), 1 ) + END IF +* +* =============================================================== +* +* Update the current I-th row of A: +* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) +* - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. 
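+*
+*     ( Editorial note: A( I, K ) was temporarily overwritten with CONE
+*     above, so that the GEMV/GEMM calls operate with the implicit
+*     unit first element of the Householder vector; the original
+*     entry, saved in AIK, is restored right after this row update. )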
+*
+      IF( K.LT.N+NRHS ) THEN
+         CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $               1, N+NRHS-K, K, -CONE, A( I, 1 ), LDA,
+     $               F( K+1, 1 ), LDF, CONE, A( I, K+1 ), LDA )
+      END IF
+*
+      A( I, K ) = AIK
+*
+*     Update the partial column 2-norms for the residual matrix,
+*     only if the residual matrix A(I+1:M,K+1:N) exists, i.e.
+*     when K < MINMNFACT = min( M-IOFFSET, N ).
+*
+      IF( K.LT.MINMNFACT ) THEN
+*
+         DO J = K + 1, N
+            IF( VN1( J ).NE.ZERO ) THEN
+*
+*              NOTE: The following lines follow from the analysis in
+*              Lapack Working Note 176.
+*
+               TEMP = ABS( A( I, J ) ) / VN1( J )
+               TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) )
+               TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+               IF( TEMP2.LE.TOL3Z ) THEN
+*
+*                 At J-index, we have a difficult column for the
+*                 update of the 2-norm. Save the index of the previous
+*                 difficult column in IWORK(J-1).
+*                 NOTE: LSTICC > 1, therefore we can use IWORK only
+*                 with N-1 elements, where the elements are
+*                 shifted by 1 to the left.
+*
+                  IWORK( J-1 ) = LSTICC
+*
+*                 Set the index of the last difficult column LSTICC.
+*
+                  LSTICC = J
+*
+               ELSE
+                  VN1( J ) = VN1( J )*SQRT( TEMP )
+               END IF
+            END IF
+         END DO
+*
+      END IF
+*
+*     End of while loop.
+*
+      END DO
+*
+*     Now, after the loop:
+*     Set KB, the number of factorized columns in the block;
+*     Set IF, the number of processed rows in the block, which
+*     is the same as the number of processed rows in
+*     the original whole matrix A_orig, IF = IOFFSET + KB.
+*
+      KB = K
+      IF = I
+*
+*     Apply the block reflector to the residual of the matrix A
+*     and the residual of the right hand sides B, if the residual
+*     matrix and/or the residual of the right hand sides
+*     exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists.
+*     This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*     A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                        A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H.
+*
+      IF( KB.LT.MINMNUPDT ) THEN
+         CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $               M-IF, N+NRHS-KB, KB, -CONE, A( IF+1, 1 ), LDA,
+     $               F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA )
+      END IF
+*
+*     Recompute the 2-norm of the difficult columns.
+*     Loop over the index of the difficult columns from the largest
+*     to the smallest index.
+*
+      DO WHILE( LSTICC.GT.0 )
+*
+*        LSTICC is the index of the last difficult column and
+*        is greater than 1.
+*        ITEMP is the index of the previous difficult column.
+*
+         ITEMP = IWORK( LSTICC-1 )
+*
+*        Compute the 2-norm explicitly for the last difficult column
+*        and save it in the partial and exact 2-norm vectors VN1
+*        and VN2.
+*
+*        NOTE: The computation of VN1( LSTICC ) relies on the fact
+*        that SCNRM2 does not fail on vectors with norm below the
+*        value of SQRT(SLAMCH('S')).
+*
+         VN1( LSTICC ) = SCNRM2( M-IF, A( IF+1, LSTICC ), 1 )
+         VN2( LSTICC ) = VN1( LSTICC )
+*
+*        Downdate the index of the last difficult column to
+*        the index of the previous difficult column.
+*
+         LSTICC = ITEMP
+*
+      END DO
+*
+      RETURN
+*
+*     End of CLAQP3RK
+*
+      END
diff --git a/lapack-netlib/SRC/claswlq.f b/lapack-netlib/SRC/claswlq.f
index 12e8373df..2044e055c 100644
--- a/lapack-netlib/SRC/claswlq.f
+++ b/lapack-netlib/SRC/claswlq.f
@@ -96,22 +96,24 @@
 *> The leading dimension of the array T. LDT >= MB.
 *> \endverbatim
 *>
-*>
 *> \param[out] WORK
 *> \verbatim
-*> (workspace) COMPLEX array, dimension (MAX(1,LWORK))
-*>
+*> (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
+*>
 *> \param[in] LWORK
 *> \verbatim
 *> LWORK is INTEGER
-*> The dimension of the array WORK.
LWORK >= MB*M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -163,33 +165,35 @@ *> * ===================================================================== SUBROUTINE CLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), T( LDT, *) + COMPLEX A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME INTEGER ILAENV REAL SROUNDUP_LWORK EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL CGELQT, CTPLQT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -200,12 +204,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LE.0 ) THEN INFO = -4 @@ -213,60 +224,61 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = SROUNDUP_LWORK(MB*M) +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN + IF( (M.GE.N) .OR. (NB.LE.M) .OR. 
(NB.GE.N) ) THEN CALL CGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL CGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL CGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M , (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL CTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + CALL CTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), $ LDA, T(1,CTR*M+1), $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL CTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), $ LDA, T(1,CTR*M+1), LDT, $ WORK, INFO ) - END IF + END IF * - WORK( 1 ) = SROUNDUP_LWORK(M * MB) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLASWLQ diff --git a/lapack-netlib/SRC/clatrs3.f b/lapack-netlib/SRC/clatrs3.f index 0502f6898..354141a8b 100644 --- a/lapack-netlib/SRC/clatrs3.f +++ b/lapack-netlib/SRC/clatrs3.f @@ -152,13 +152,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK). +*> WORK is REAL array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal size of *> WORK. *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -166,6 +170,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -182,7 +187,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -257,15 +262,16 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, CLANGE, SLARMM - EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM + REAL SLAMCH, CLANGE, SLARMM, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLATRS, CSSCAL, XERBLA @@ -296,15 +302,24 @@ * row. WORK( I + KK * LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). 
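+*
+*     ( Editorial illustration, assuming sample values: for N = 1000,
+*     NB = 64, NRHS = 4 and NBRHS = 32 one gets
+*     NBA = (1000+64-1)/64 = 16, hence
+*     LSCALE = 16*MAX( 16, MIN( 4, 32 ) ) = 256, LANRM = 16*16 = 256,
+*     and the minimal workspace is LWMIN = LSCALE + LANRM = 512. )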
+* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * * Test the input parameters. * @@ -326,7 +341,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN @@ -659,6 +674,9 @@ END IF END DO END DO +* + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) +* RETURN * * End of CLATRS3 diff --git a/lapack-netlib/SRC/clatsqr.f b/lapack-netlib/SRC/clatsqr.f index cd2cb4aa7..67403693f 100644 --- a/lapack-netlib/SRC/clatsqr.f +++ b/lapack-netlib/SRC/clatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -165,32 +168,34 @@ *> * ===================================================================== SUBROUTINE CLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), T(LDT, *) + COMPLEX A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, LWMIN, MINMN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME REAL SROUNDUP_LWORK EXTERNAL LSAME, SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. - EXTERNAL CGEQRT, CTPQRT, XERBLA + EXTERNAL CGEQRT, CTPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -201,6 +206,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -208,64 +220,65 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
(.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = SROUNDUP_LWORK(NB*N) +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL CGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + IF ( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL CGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL CGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) - CTR = 1 + CALL CGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = MB+1, II-MB+N , (MB-N) + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL CTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + CALL CTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, $ T(1,CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL CTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + IF( II.LE.M ) THEN + CALL CTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, $ T(1, CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + $ WORK, INFO ) + END IF * - WORK( 1 ) = SROUNDUP_LWORK(N*NB) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLATSQR diff --git a/lapack-netlib/SRC/dbbcsd.f b/lapack-netlib/SRC/dbbcsd.f index 4fcf9ab5d..913f96a73 100644 --- a/lapack-netlib/SRC/dbbcsd.f +++ b/lapack-netlib/SRC/dbbcsd.f @@ -805,7 +805,7 @@ CALL DLARTGP( B22BULGE, B22E(I-1), WORK(IU2SN+I-1), $ WORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL DLARTGS( B21E(I), B21E(I+1), NU, WORK(IU2CS+I-1), + CALL DLARTGS( B21E(I), B21D(I+1), NU, WORK(IU2CS+I-1), $ WORK(IU2SN+I-1) ) ELSE CALL DLARTGS( B22D(I), B22E(I), MU, WORK(IU2CS+I-1), diff --git a/lapack-netlib/SRC/dgebrd.f b/lapack-netlib/SRC/dgebrd.f index 0f0d1651a..ac11d48a0 100644 --- a/lapack-netlib/SRC/dgebrd.f +++ b/lapack-netlib/SRC/dgebrd.f @@ -122,7 +122,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -147,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -223,8 +224,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. 
EXTERNAL DGEBD2, DGEMM, DLABRD, XERBLA @@ -241,9 +242,17 @@ * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + ENDIF WORK( 1 ) = DBLE( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -251,7 +260,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -263,7 +272,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -282,7 +290,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using diff --git a/lapack-netlib/SRC/dgedmd.f90 b/lapack-netlib/SRC/dgedmd.f90 index 20424808f..15df48fe9 100644 --- a/lapack-netlib/SRC/dgedmd.f90 +++ b/lapack-netlib/SRC/dgedmd.f90 @@ -1,424 +1,574 @@ - SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & - M, N, X, LDX, Y, LDY, NRNK, TOL, & - K, REIG, IMEIG, Z, LDZ, RES, & - B, LDB, W, LDW, S, LDS, & - WORK, LWORK, IWORK, LIWORK, INFO ) -! March 2023 +!> \brief \b DGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== +! +! SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, REIG, IMEIG, Z, LDZ, RES, & +! B, LDB, W, LDW, S, LDS, & +! WORK, LWORK, IWORK, LIWORK, INFO ) +! !..... - USE iso_fortran_env - IMPLICIT NONE - INTEGER, PARAMETER :: WP = real64 +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real64 !..... ! Scalar arguments - CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF - INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & - NRNK, LDZ, LDB, LDW, LDS, & - LWORK, LIWORK - INTEGER, INTENT(OUT) :: K, INFO - REAL(KIND=WP), INTENT(IN) :: TOL +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LWORK, LIWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL ! Array arguments - REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) - REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & - W(LDW,*), S(LDS,*) - REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & - RES(*) - REAL(KIND=WP), INTENT(OUT) :: WORK(*) - INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! DGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, DGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, DGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! 
and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. +! REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & +! RES(*) +! REAL(KIND=WP), INTENT(OUT) :: WORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) ! -!...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ !............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> DGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, DGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, DGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. 
Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim +!...................................................................... +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim +!...................................................................... +!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim +!...................................................................... ! Arguments ! ========= -! JOBS (input) CHARACTER*1 -! Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) is CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim !..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 
'N' :: The eigenvectors are not computed.
+!> \param[in] JOBZ
+!> \verbatim
+!> JOBZ (input) CHARACTER*1
+!> Determines whether the eigenvectors (Koopman modes) will
+!> be computed.
+!> 'V' :: The eigenvectors (Koopman modes) will be computed
+!> and returned in the matrix Z.
+!> See the description of Z.
+!> 'F' :: The eigenvectors (Koopman modes) will be returned
+!> in factored form as the product X(:,1:K)*W, where X
+!> contains a POD basis (leading left singular vectors
+!> of the data matrix X) and W contains the eigenvectors
+!> of the corresponding Rayleigh quotient.
+!> See the descriptions of K, X, W, Z.
+!> 'N' :: The eigenvectors are not computed.
+!> \endverbatim
 !.....
-! JOBR (input) CHARACTER*1
-! Determines whether to compute the residuals.
-! 'R' :: The residuals for the computed eigenpairs will be
-! computed and stored in the array RES.
-! See the description of RES.
-! For this option to be legal, JOBZ must be 'V'.
-! 'N' :: The residuals are not computed.
+!> \param[in] JOBR
+!> \verbatim
+!> JOBR (input) CHARACTER*1
+!> Determines whether to compute the residuals.
+!> 'R' :: The residuals for the computed eigenpairs will be
+!> computed and stored in the array RES.
+!> See the description of RES.
+!> For this option to be legal, JOBZ must be 'V'.
+!> 'N' :: The residuals are not computed.
+!> \endverbatim
 !.....
-! JOBF (input) CHARACTER*1
-! Specifies whether to store information needed for post-
-! processing (e.g. computing refined Ritz vectors)
-! 'R' :: The matrix needed for the refinement of the Ritz
-! vectors is computed and stored in the array B.
-! See the description of B.
-! 'E' :: The unscaled eigenvectors of the Exact DMD are
-! computed and returned in the array B. See the
-! description of B.
-! 'N' :: No eigenvector refinement data is computed.
+!> \param[in] JOBF
+!> \verbatim
+!> JOBF (input) CHARACTER*1
+!> Specifies whether to store information needed for post-
+!> processing (e.g. computing refined Ritz vectors)
+!> 'R' :: The matrix needed for the refinement of the Ritz
+!> vectors is computed and stored in the array B.
+!> See the description of B.
+!> 'E' :: The unscaled eigenvectors of the Exact DMD are
+!> computed and returned in the array B. See the
+!> description of B.
+!> 'N' :: No eigenvector refinement data is computed.
+!> \endverbatim
 !.....
-! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 }
-! Allows for a selection of the SVD algorithm from the
-! LAPACK library.
-! 1 :: DGESVD (the QR SVD algorithm)
-! 2 :: DGESDD (the Divide and Conquer algorithm; if enough
-! workspace available, this is the fastest option)
-! 3 :: DGESVDQ (the preconditioned QR SVD ; this and 4
-! are the most accurate options)
-! 4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3
-! are the most accurate options)
-! For the four methods above, a significant difference in
-! the accuracy of small singular values is possible if
-! the snapshots vary in norm so that X is severely
-! ill-conditioned. If small (smaller than EPS*||X||)
-! singular values are of interest and JOBS=='N', then
-! the options (3, 4) give the most accurate results, where
-! the option 4 is slightly better and with stronger
-! theoretical background.
-! If JOBS=='S', i.e. the columns of X will be normalized,
-! then all methods give nearly equally accurate results.
+!> \param[in] WHTSVD
+!> \verbatim
+!> WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 }
+!> Allows for a selection of the SVD algorithm from the
+!> LAPACK library.
+!> 1 :: DGESVD (the QR SVD algorithm) +!> 2 :: DGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: DGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim !..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim !..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim !..... -! X (input/output) REAL(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. +!> \param[in,out] X +!> \verbatim +!> X (input/output) REAL(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. +!> \endverbatim !..... -! LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. +!> \param[in] LDX +!> \verbatim +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim !..... -! Y (input/workspace/output) REAL(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) REAL(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim !..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. 
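! The hunks above standardize the LWORK = -1 workspace-query
! convention: such a call performs no computation and only stores the
! minimal (or optimal) workspace length in WORK(1). A minimal sketch
! of the intended query-then-allocate calling pattern, using DGEBRD
! from the hunk above; the driver program and its data are
! illustrative only, not part of the patch:
PROGRAM WORKSPACE_QUERY
   IMPLICIT NONE
   INTEGER, PARAMETER :: M = 100, N = 80
   INTEGER :: INFO, LWORK, I, J
   DOUBLE PRECISION :: A(M,N), D(N), E(N-1), TAUQ(N), TAUP(N), WQ(1)
   DOUBLE PRECISION, ALLOCATABLE :: WORK(:)
   EXTERNAL DGEBRD
   ! Some placeholder data so the factorization has work to do.
   DO J = 1, N
      DO I = 1, M
         A(I,J) = 1.0D0 / DBLE(I+J-1)
      END DO
   END DO
   ! Query call: LWORK = -1 only sets WQ(1) to the workspace size
   ! (DBLE(LWKOPT) in the DGEBRD hunk above).
   CALL DGEBRD( M, N, A, M, D, E, TAUQ, TAUP, WQ, -1, INFO )
   LWORK = INT( WQ(1) )
   ALLOCATE( WORK(LWORK) )
   ! Actual reduction to bidiagonal form with the queried size.
   CALL DGEBRD( M, N, A, M, D, E, TAUQ, TAUP, WORK, LWORK, INFO )
END PROGRAM WORKSPACE_QUERY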
+!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim !..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1). -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1). +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim !..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim !..... -! K (output) INTEGER, 0 <= K <= N -! The dimension of the POD basis for the data snapshot -! matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim !..... -! REIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of REIG contain -! the real parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! See the descriptions of K, IMEIG, and Z. +!> \param[out] REIG +!> \verbatim +!> REIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of REIG contain +!> the real parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> See the descriptions of K, IMEIG, and Z. 
+!> \endverbatim !..... -! IMEIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of IMEIG contain -! the imaginary parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! The eigenvalues are determined as follows: -! If IMEIG(i) == 0, then the corresponding eigenvalue is -! real, LAMBDA(i) = REIG(i). -! If IMEIG(i)>0, then the corresponding complex -! conjugate pair of eigenvalues reads -! LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) -! LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) -! That is, complex conjugate pairs have consecutive -! indices (i,i+1), with the positive imaginary part -! listed first. -! See the descriptions of K, REIG, and Z. +!> \param[out] IMEIG +!> \verbatim +!> IMEIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of IMEIG contain +!> the imaginary parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> The eigenvalues are determined as follows: +!> If IMEIG(i) == 0, then the corresponding eigenvalue is +!> real, LAMBDA(i) = REIG(i). +!> If IMEIG(i)>0, then the corresponding complex +!> conjugate pair of eigenvalues reads +!> LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) +!> LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) +!> That is, complex conjugate pairs have consecutive +!> indices (i,i+1), with the positive imaginary part +!> listed first. +!> See the descriptions of K, REIG, and Z. +!> \endverbatim !..... -! Z (workspace/output) REAL(KIND=WP) M-by-N array -! If JOBZ =='V' then -! Z contains real Ritz vectors as follows: -! If IMEIG(i)=0, then Z(:,i) is an eigenvector of -! the i-th Ritz value; ||Z(:,i)||_2=1. -! If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then -! [Z(:,i) Z(:,i+1)] span an invariant subspace and -! the Ritz values extracted from this subspace are -! REIG(i) + sqrt(-1)*IMEIG(i) and -! REIG(i) - sqrt(-1)*IMEIG(i). -! The corresponding eigenvectors are -! Z(:,i) + sqrt(-1)*Z(:,i+1) and -! Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. -! || Z(:,i:i+1)||_F = 1. -! If JOBZ == 'F', then the above descriptions hold for -! the columns of X(:,1:K)*W(1:K,1:K), where the columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. The columns of W(1:K,1:K) -! are similarly structured: If IMEIG(i) == 0 then -! X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 -! then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and -! X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) -! are the eigenvectors of LAMBDA(i), LAMBDA(i+1). -! See the descriptions of REIG, IMEIG, X and W. +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) REAL(KIND=WP) M-by-N array +!> If JOBZ =='V' then +!> Z contains real Ritz vectors as follows: +!> If IMEIG(i)=0, then Z(:,i) is an eigenvector of +!> the i-th Ritz value; ||Z(:,i)||_2=1. +!> If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then +!> [Z(:,i) Z(:,i+1)] span an invariant subspace and +!> the Ritz values extracted from this subspace are +!> REIG(i) + sqrt(-1)*IMEIG(i) and +!> REIG(i) - sqrt(-1)*IMEIG(i). +!> The corresponding eigenvectors are +!> Z(:,i) + sqrt(-1)*Z(:,i+1) and +!> Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. +!> || Z(:,i:i+1)||_F = 1. +!> If JOBZ == 'F', then the above descriptions hold for +!> the columns of X(:,1:K)*W(1:K,1:K), where the columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. 
The columns of W(1:K,1:K) +!> are similarly structured: If IMEIG(i) == 0 then +!> X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 +!> then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and +!> X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) +!> are the eigenvectors of LAMBDA(i), LAMBDA(i+1). +!> See the descriptions of REIG, IMEIG, X and W. +!> \endverbatim !..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim !..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs. -! If LAMBDA(i) is real, then -! RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. -! If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair -! then -! RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F -! where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] -! [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. -! It holds that -! RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 -! RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 -! where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) -! ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) -! See the description of REIG, IMEIG and Z. +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs. +!> If LAMBDA(i) is real, then +!> RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. +!> If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair +!> then +!> RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F +!> where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] +!> [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. +!> It holds that +!> RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 +!> RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 +!> where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) +!> ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) +!> See the description of REIG, IMEIG and Z. +!> \endverbatim !..... -! B (output) REAL(KIND=WP) M-by-N array. -! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1;K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. +!> \param[out] B +!> \verbatim +!> B (output) REAL(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1;K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim !..... -! LDB (input) INTEGER, LDB >= M -! The leading dimension of the array B. +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim !..... -! W (workspace/output) REAL(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient (real and -! imaginary parts for each complex conjugate pair of the -! eigenvalues). The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! 
W is also used as a workspace to temporarily store the
-! right singular vectors of X.
+!> \param[out] W
+!> \verbatim
+!> W (workspace/output) REAL(KIND=WP) N-by-N array
+!> On exit, W(1:K,1:K) contains the K computed
+!> eigenvectors of the matrix Rayleigh quotient (real and
+!> imaginary parts for each complex conjugate pair of the
+!> eigenvalues). The Ritz vectors (returned in Z) are the
+!> product of X (containing a POD basis for the input
+!> matrix X) and W. See the descriptions of K, S, X and Z.
+!> W is also used as a workspace to temporarily store the
+!> right singular vectors of X.
+!> \endverbatim
 !.....
-! LDW (input) INTEGER, LDW >= N
-! The leading dimension of the array W.
+!> \param[in] LDW
+!> \verbatim
+!> LDW (input) INTEGER, LDW >= N
+!> The leading dimension of the array W.
+!> \endverbatim
 !.....
-! S (workspace/output) REAL(KIND=WP) N-by-N array
-! The array S(1:K,1:K) is used for the matrix Rayleigh
-! quotient. This content is overwritten during
-! the eigenvalue decomposition by DGEEV.
-! See the description of K.
+!> \param[out] S
+!> \verbatim
+!> S (workspace/output) REAL(KIND=WP) N-by-N array
+!> The array S(1:K,1:K) is used for the matrix Rayleigh
+!> quotient. This content is overwritten during
+!> the eigenvalue decomposition by DGEEV.
+!> See the description of K.
+!> \endverbatim
 !.....
-! LDS (input) INTEGER, LDS >= N
-! The leading dimension of the array S.
+!> \param[in] LDS
+!> \verbatim
+!> LDS (input) INTEGER, LDS >= N
+!> The leading dimension of the array S.
+!> \endverbatim
 !.....
-! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
-! On exit, WORK(1:N) contains the singular values of
-! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
-! If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain
-! scaling factor WORK(N+2)/WORK(N+1) used to scale X
-! and Y to avoid overflow in the SVD of X.
-! This may be of interest if the scaling option is off
-! and as many as possible smallest eigenvalues are
-! desired to the highest feasible accuracy.
-! If the call to DGEDMD is only workspace query, then
-! WORK(1) contains the minimal workspace length and
-! WORK(2) is the optimal workspace length. Hence, the
-! leng of work is at least 2.
-! See the description of LWORK.
+!> \param[out] WORK
+!> \verbatim
+!> WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
+!> On exit, WORK(1:N) contains the singular values of
+!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
+!> If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain
+!> scaling factor WORK(N+2)/WORK(N+1) used to scale X
+!> and Y to avoid overflow in the SVD of X.
+!> This may be of interest if the scaling option is off
+!> and as many as possible smallest eigenvalues are
+!> desired to the highest feasible accuracy.
+!> If the call to DGEDMD is only a workspace query, then
+!> WORK(1) contains the minimal workspace length and
+!> WORK(2) is the optimal workspace length. Hence, the
+!> length of WORK is at least 2.
+!> See the description of LWORK.
+!> \endverbatim
 !.....
-! LWORK (input) INTEGER
-! The minimal length of the workspace vector WORK.
-! LWORK is calculated as follows:
-! If WHTSVD == 1 ::
-! If JOBZ == 'V', then
-! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)).
-! If JOBZ == 'N' then
-! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)).
-! Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal
-! workspace length of DGESVD.
-! If WHTSVD == 2 ::
-! If JOBZ == 'V', then
-! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N))
-! If JOBZ == 'N', then
-!
LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the -! minimal workspace length of DGESDD. -! If WHTSVD == 3 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = N+M+MAX(3*N+1, -! MAX(1,3*N+M,5*N),MAX(1,N)) -! is the minimal workspace length of DGESVDQ. -! If WHTSVD == 4 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the -! minimal workspace length of DGEJSV. -! The above expressions are not simplified in order to -! make the usage of WORK more transparent, and for -! easier checking. In any case, LWORK >= 2. -! If on entry LWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LWORK +!> \verbatim +!> LWORK (input) INTEGER +!> The minimal length of the workspace vector WORK. +!> LWORK is calculated as follows: +!> If WHTSVD == 1 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). +!> If JOBZ == 'N' then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). +!> Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal +!> workspace length of DGESVD. +!> If WHTSVD == 2 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the +!> minimal workspace length of DGESDD. +!> If WHTSVD == 3 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = N+M+MAX(3*N+1, +!> MAX(1,3*N+M,5*N),MAX(1,N)) +!> is the minimal workspace length of DGESVDQ. +!> If WHTSVD == 4 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the +!> minimal workspace length of DGEJSV. +!> The above expressions are not simplified in order to +!> make the usage of WORK more transparent, and for +!> easier checking. In any case, LWORK >= 2. +!> If on entry LWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim !..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! 
If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! !............................................................. !............................................................. + SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, REIG, IMEIG, Z, LDZ, RES, & + B, LDB, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +! +! Scalar arguments +! ~~~~~~~~~~~~~~~~ + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LWORK, LIWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! +! Array arguments +! ~~~~~~~~~~~~~~~ + REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +! ! Parameters ! 
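! The REIG/IMEIG/Z packing documented above follows the usual LAPACK
! convention for real nonsymmetric eigenproblems (as in DGEEV): a
! complex conjugate pair occupies two consecutive entries and
! columns, positive imaginary part first. A hedged sketch of
! unpacking the K computed Ritz pairs into complex form; the helper
! name and argument list are illustrative, not part of DGEDMD:
SUBROUTINE UNPACK_RITZ( M, K, REIG, IMEIG, Z, LDZ, LAMBDA, ZC, LDZC )
   IMPLICIT NONE
   INTEGER, INTENT(IN) :: M, K, LDZ, LDZC
   DOUBLE PRECISION, INTENT(IN) :: REIG(*), IMEIG(*), Z(LDZ,*)
   COMPLEX(KIND(1.0D0)), INTENT(OUT) :: LAMBDA(*), ZC(LDZC,*)
   INTEGER :: I
   I = 1
   DO WHILE ( I <= K )
      IF ( IMEIG(I) == 0.0D0 ) THEN
         ! Real Ritz value: Z(:,I) is already the eigenvector.
         LAMBDA(I) = CMPLX( REIG(I), 0.0D0, KIND(1.0D0) )
         ZC(1:M,I) = CMPLX( Z(1:M,I), 0.0D0, KIND(1.0D0) )
         I = I + 1
      ELSE
         ! Conjugate pair: Z(:,I) and Z(:,I+1) hold the real and
         ! imaginary parts of the eigenvector for REIG(I)+i*IMEIG(I).
         LAMBDA(I)   = CMPLX( REIG(I),  IMEIG(I), KIND(1.0D0) )
         LAMBDA(I+1) = CMPLX( REIG(I), -IMEIG(I), KIND(1.0D0) )
         ZC(1:M,I)   = CMPLX( Z(1:M,I),  Z(1:M,I+1), KIND(1.0D0) )
         ZC(1:M,I+1) = CMPLX( Z(1:M,I), -Z(1:M,I+1), KIND(1.0D0) )
         I = I + 2
      END IF
   END DO
END SUBROUTINE UNPACK_RITZ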
~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP - +! ! Local scalars ! ~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -432,10 +582,11 @@ WNTEX, WNTREF, WNTRES, WNTVEC CHARACTER :: JOBZL, T_OR_N CHARACTER :: JSVOPT - +! ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2) +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) DLANGE, DLAMCH, DNRM2 @@ -443,13 +594,13 @@ INTEGER IDAMAX LOGICAL DISNAN, LSAME EXTERNAL DISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~~~~ EXTERNAL DAXPY, DGEMM, DSCAL EXTERNAL DGEEV, DGEJSV, DGESDD, DGESVD, DGESVDQ, & DLACPY, DLASCL, DLASSQ, XERBLA - +! ! Intrinsic functions ! ~~~~~~~~~~~~~~~~~~~ INTRINSIC DBLE, INT, MAX, SQRT @@ -632,7 +783,8 @@ K = 0 DO i = 1, N !WORK(i) = DNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL DLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 @@ -705,7 +857,8 @@ ! carefully computed using DLASSQ. DO i = 1, N !WORK(i) = DNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL DLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 @@ -1051,4 +1204,3 @@ RETURN ! ...... END SUBROUTINE DGEDMD - diff --git a/lapack-netlib/SRC/dgehrd.f b/lapack-netlib/SRC/dgehrd.f index a40c61cb6..d95bbd182 100644 --- a/lapack-netlib/SRC/dgehrd.f +++ b/lapack-netlib/SRC/dgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gehrd * *> \par Further Details: * ===================== @@ -173,7 +173,7 @@ INTEGER IHI, ILO, INFO, LDA, LWORK, N * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) + DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) * .. * * ===================================================================== @@ -182,7 +182,7 @@ INTEGER NBMAX, LDT, TSIZE PARAMETER ( NBMAX = 64, LDT = NBMAX+1, $ TSIZE = LDT*NBMAX ) - DOUBLE PRECISION ZERO, ONE + DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, $ ONE = 1.0D+0 ) * .. @@ -190,7 +190,7 @@ LOGICAL LQUERY INTEGER I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB, $ NBMIN, NH, NX - DOUBLE PRECISION EI + DOUBLE PRECISION EI * .. * .. External Subroutines .. 
EXTERNAL DAXPY, DGEHD2, DGEMM, DLAHR2, DLARFB, DTRMM, @@ -221,12 +221,18 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + ENDIF WORK( 1 ) = LWKOPT END IF * @@ -248,7 +254,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -268,7 +273,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of @@ -344,6 +349,7 @@ * Use unblocked code to reduce the rest of the matrix * CALL DGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) +* WORK( 1 ) = LWKOPT * RETURN diff --git a/lapack-netlib/SRC/dgelq.f b/lapack-netlib/SRC/dgelq.f index 013b6c356..255e8732f 100644 --- a/lapack-netlib/SRC/dgelq.f +++ b/lapack-netlib/SRC/dgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -166,6 +166,8 @@ *> the LQ factorization. *> \endverbatim *> +*> \ingroup gelq +*> * ===================================================================== SUBROUTINE DGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) diff --git a/lapack-netlib/SRC/dgelqf.f b/lapack-netlib/SRC/dgelqf.f index ed3372f96..f0eb00a55 100644 --- a/lapack-netlib/SRC/dgelqf.f +++ b/lapack-netlib/SRC/dgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -118,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gelqf * *> \par Further Details: * ===================== @@ -174,9 +175,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'DGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -184,19 +184,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = LWKOPT RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN diff --git a/lapack-netlib/SRC/dgelsd.f b/lapack-netlib/SRC/dgelsd.f index b1f45a2c6..7dc564f48 100644 --- a/lapack-netlib/SRC/dgelsd.f +++ b/lapack-netlib/SRC/dgelsd.f @@ -188,7 +188,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsolve +*> \ingroup gelsd * *> \par Contributors: * ================== @@ -228,7 +228,7 @@ DOUBLE PRECISION ANRM, BIGNUM, BNRM, EPS, SFMIN, SMLNUM * .. * .. 
External Subroutines .. - EXTERNAL DGEBRD, DGELQF, DGEQRF, DLABAD, DLACPY, DLALSD, + EXTERNAL DGEBRD, DGELQF, DGEQRF, DLACPY, DLALSD, $ DLASCL, DLASET, DORMBR, DORMLQ, DORMQR, XERBLA * .. * .. External Functions .. @@ -276,7 +276,7 @@ $ LOG( TWO ) ) + 1, 0 ) * IF( INFO.EQ.0 ) THEN - MAXWRK = 0 + MAXWRK = 1 LIWORK = 3*MINMN*NLVL + 11*MINMN MM = M IF( M.GE.N .AND. M.GE.MNTHR ) THEN @@ -372,7 +372,6 @@ SFMIN = DLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A if max entry outside range [SMLNUM,BIGNUM]. * diff --git a/lapack-netlib/SRC/dgemlq.f b/lapack-netlib/SRC/dgemlq.f index 3ba209105..757683f46 100644 --- a/lapack-netlib/SRC/dgemlq.f +++ b/lapack-netlib/SRC/dgemlq.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -160,11 +161,13 @@ *> block sizes MB and NB returned by ILAENV, DGELQ will use either *> DLASWLQ (if the matrix is wide-and-short) or DGELQT to compute *> the LQ factorization. -*> This version of DGEMLQ will use either DLAMSWLQ or DGEMLQT to +*> This version of DGEMLQ will use either DLAMSWLQ or DGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in DLAMSWLQ or DGEMLQT. *> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE DGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -186,7 +189,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -202,7 +205,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -217,6 +220,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -245,12 +255,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
.NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN @@ -262,7 +272,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -275,7 +285,7 @@ $ MB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = LWMIN * RETURN * diff --git a/lapack-netlib/SRC/dgemqr.f b/lapack-netlib/SRC/dgemqr.f index 022cf21e4..608815483 100644 --- a/lapack-netlib/SRC/dgemqr.f +++ b/lapack-netlib/SRC/dgemqr.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -160,12 +161,14 @@ *> block sizes MB and NB returned by ILAENV, DGEQR will use either *> DLATSQR (if the matrix is tall-and-skinny) or DGEQRT to compute *> the QR factorization. -*> This version of DGEMQR will use either DLAMTSQR or DGEMQRT to +*> This version of DGEMQR will use either DLAMTSQR or DGEMQRT to *> multiply matrix Q by another matrix. *> Further Details in DLATMSQR or DGEMQRT. *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE DGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -187,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -203,7 +206,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -218,6 +221,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -246,12 +256,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
.NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN @@ -263,7 +273,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -276,7 +286,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = LWMIN * RETURN * diff --git a/lapack-netlib/SRC/dgeqlf.f b/lapack-netlib/SRC/dgeqlf.f index b8ac0b1a0..a72d9dc76 100644 --- a/lapack-netlib/SRC/dgeqlf.f +++ b/lapack-netlib/SRC/dgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is the *> optimal blocksize. *> @@ -113,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup geqlf * *> \par Further Details: * ===================== @@ -188,8 +189,9 @@ END IF WORK( 1 ) = LWKOPT * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * diff --git a/lapack-netlib/SRC/dgeqp3rk.c b/lapack-netlib/SRC/dgeqp3rk.c new file mode 100644 index 000000000..17a78dd5a --- /dev/null +++ b/lapack-netlib/SRC/dgeqp3rk.c @@ -0,0 +1,1059 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint 
ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + +/* Check if the matrix A contains +Inf or -Inf, set INFO parameter */ +/* to the column number, where the first +/-Inf is found plus N, */ +/* and continue the computation. */ + + *info = *n + kp1; + + } + +/* ================================================================== */ + +/* Quick return if possible for the case when the first */ +/* stopping criterion is satisfied, i.e. KMAX = 0. */ + + if (*kmax == 0) { + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.; + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + tau[j] = 0.; + } + work[1] = (doublereal) lwkopt; + return 0; + } + +/* ================================================================== */ + + eps = dlamch_("Epsilon"); + +/* Adjust ABSTOL */ + + if (*abstol >= 0.) { + safmin = dlamch_("Safe minimum"); +/* Computing MAX */ + d__1 = *abstol, d__2 = safmin * 2.; + *abstol = f2cmax(d__1,d__2); + } + +/* Adjust RELTOL */ + + if (*reltol >= 0.) { + *reltol = f2cmax(*reltol,eps); + } + +/* =================================================================== */ + +/* JMAX is the maximum index of the column to be factorized, */ +/* which is also limited by the first stopping criterion KMAX. */ + + jmax = f2cmin(*kmax,minmn); + +/* =================================================================== */ + +/* Quick return if possible for the case when the second or third */ +/* stopping criterion for the whole original matrix is satified, */ +/* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */ +/* (which is ONE <= RELTOL). */ + + if (maxc2nrm <= *abstol || 1. <= *reltol) { + + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.; + + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + tau[j] = 0.; + } + + work[1] = (doublereal) lwkopt; + return 0; + } + +/* ================================================================== */ +/* Factorize columns */ +/* ================================================================== */ + +/* Determine the block size. */ + + nbmin = 2; + nx = 0; + + if (nb > 1 && nb < minmn) { + +/* Determine when to cross over from blocked to unblocked code. */ +/* (for N less than NX, unblocked code should be used). 
*/ + +/* Computing MAX */ + i__1 = 0, i__2 = ilaenv_(&c__3, "DGEQP3RK", " ", m, n, &c_n1, &c_n1, ( + ftnlen)8, (ftnlen)1); + nx = f2cmax(i__1,i__2); + + if (nx < minmn) { + +/* Determine if workspace is large enough for blocked code. */ + + if (*lwork < lwkopt) { + +/* Not enough workspace to use optimal block size that */ +/* is currently stored in NB. */ +/* Reduce NB and determine the minimum value of NB. */ + + nb = (*lwork - (*n << 1)) / (*n + 1); +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "DGEQP3RK", " ", m, n, &c_n1, + &c_n1, (ftnlen)8, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + } + } + } + +/* ================================================================== */ + +/* DONE is the boolean flag to represent the case when the */ +/* factorization completed in the block factorization routine, */ +/* before the end of the block. */ + + done = FALSE_; + +/* J is the column index. */ + + j = 1; + +/* (1) Use blocked code initially. */ + +/* JMAXB is the maximum column index of the block, when the */ +/* blocked code is used; it is also limited by the first stopping */ +/* criterion KMAX. */ + +/* Computing MIN */ + i__1 = *kmax, i__2 = minmn - nx; + jmaxb = f2cmin(i__1,i__2); + + if (nb >= nbmin && nb < jmax && jmaxb > 0) { + +/* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */ +/* J is the column index of a column block; */ +/* JB is the column block size to pass to block factorization */ +/* routine in a loop step; */ +/* JBF is the number of columns that were actually factorized, */ +/* as returned by the block factorization routine */ +/* in a loop step, JBF <= JB; */ +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + while(j <= jmaxb) { + +/* Computing MIN */ + i__1 = nb, i__2 = jmaxb - j + 1; + jb = f2cmin(i__1,i__2); + n_sub__ = *n - j + 1; + ioffset = j - 1; + +/* Factorize JB columns among the columns A(J:N). */ + + i__1 = *n + *nrhs - j + 1; + dlaqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, + &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, + maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], & + work[*n + j], &work[(*n << 1) + 1], &work[(*n << 1) + jb + + 1], &i__1, &iwork[1], &iinfo); + +/* Set INFO on the first occurrence of Inf. */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } + + if (done) { + +/* Either the submatrix is zero before the end of the */ +/* column block, or ABSTOL or RELTOL criterion is */ +/* satisfied before the end of the column block, so we can */ +/* return from the routine. Perform the following before */ +/* returning: */ +/* a) Set the number of factorized columns K, */ +/* K = IOFFSET + JBF from the last call of blocked */ +/* routine. */ +/* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */ +/* by the block factorization routine; */ +/* 2) The remaining TAUs are set to ZERO by the */ +/* block factorization routine. */ + + *k = ioffset + jbf; + +/* Set INFO on the first occurrence of NaN, NaN takes */ +/* precedence over Inf. */ + + if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + +/* Return from the routine. */ + + work[1] = (doublereal) lwkopt; + + return 0; + + } + + j += jbf; + + } + + } + +/* Use unblocked code to factor the last or only block. */ +/* J = JMAX+1 means we factorized the maximum possible number of */ +/* columns, that is, in the ELSE clause we need to compute */ +/* the MAXC2NORM and RELMAXC2NORM to return after we processed */ +/* the blocks. */ + + if (j <= jmax) { + +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + n_sub__ = *n - j + 1; + ioffset = j - 1; + + i__1 = jmax - j + 1; + dlaqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, & + maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, + relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], &work[*n + j], & + work[(*n << 1) + 1], &iinfo); + +/* ABSTOL or RELTOL criterion is satisfied when the number of */ +/* the factorized columns KF is smaller than the number */ +/* of columns JMAX-J+1 supplied to be factorized by the */ +/* unblocked routine; in this case we can return from */ +/* the routine. Perform the following before returning: */ +/* a) Set the number of factorized columns K, */ +/* b) MAXC2NRMK and RELMAXC2NRMK are returned by the */ +/* unblocked factorization routine above. */ + + *k = j - 1 + kf; + +/* Set INFO on the first exception occurrence. */ + +/* Set INFO on the first exception occurrence of Inf or NaN, */ +/* (NaN takes precedence over Inf). */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } else if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + + } else { + +/* Compute the return values for blocked code. */ + +/* Set the number of factorized columns if the unblocked routine */ +/* was not called. */ + + *k = jmax; + +/* If there exists a residual matrix after the blocked code: */ +/* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */ +/* residual matrix, otherwise set them to ZERO; */ +/* 2) Set TAU(K+1:MINMN) to ZERO. */ + + if (*k < minmn) { + i__1 = *n - *k; + jmaxc2nrm = *k + idamax_(&i__1, &work[*k + 1], &c__1); + *maxc2nrmk = work[jmaxc2nrm]; + if (*k == 0) { + *relmaxc2nrmk = 1.; + } else { + *relmaxc2nrmk = *maxc2nrmk / maxc2nrm; + } + + i__1 = minmn; + for (j = *k + 1; j <= i__1; ++j) { + tau[j] = 0.; + } + + } + +/* END IF( J.LE.JMAX ) THEN */ + + } + + work[1] = (doublereal) lwkopt; + + return 0; + +/* End of DGEQP3RK */ + +} /* dgeqp3rk_ */ + diff --git a/lapack-netlib/SRC/dgeqp3rk.f b/lapack-netlib/SRC/dgeqp3rk.f new file mode 100644 index 000000000..b8e41b39c --- /dev/null +++ b/lapack-netlib/SRC/dgeqp3rk.f @@ -0,0 +1,1082 @@ +*> \brief \b DGEQP3RK computes a truncated Householder QR factorization with column pivoting of a real m-by-n matrix A by using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGEQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, +* $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ WORK, LWORK, IWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, K, KMAX, LDA, LWORK, M, N, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGEQP3RK performs two tasks simultaneously: +*> +*> Task 1: The routine computes a truncated (rank K) or full rank +*> Householder QR factorization with column pivoting of a real +*> M-by-N matrix A using Level 3 BLAS. K is the number of columns +*> that were factorized, i.e. factorization rank of the +*> factor R, K <= min(M,N).
+*> +*> A * P(K) = Q(K) * R(K) = +*> +*> = Q(K) * ( R11(K) R12(K) ) = Q(K) * ( R(K)_approx ) +*> ( 0 R22(K) ) ( 0 R(K)_residual ), +*> +*> where: +*> +*> P(K) is an N-by-N permutation matrix; +*> Q(K) is an M-by-M orthogonal matrix; +*> R(K)_approx = ( R11(K), R12(K) ) is a rank K approximation of the +*> full rank factor R with K-by-K upper-triangular +*> R11(K) and K-by-N rectangular R12(K). The diagonal +*> entries of R11(K) appear in non-increasing order +*> of absolute value, and absolute values of all of +*> them exceed the maximum column 2-norm of R22(K) +*> up to roundoff error. +*> R(K)_residual = R22(K) is the residual of a rank K approximation +*> of the full rank factor R. It is a +*> an (M-K)-by-(N-K) rectangular matrix; +*> 0 is a an (M-K)-by-K zero matrix. +*> +*> Task 2: At the same time, the routine overwrites a real M-by-NRHS +*> matrix B with Q(K)**T * B using Level 3 BLAS. +*> +*> ===================================================================== +*> +*> The matrices A and B are stored on input in the array A as +*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS) +*> respectively. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> The truncation criteria (i.e. when to stop the factorization) +*> can be any of the following: +*> +*> 1) The input parameter KMAX, the maximum number of columns +*> KMAX to factorize, i.e. the factorization rank is limited +*> to KMAX. If KMAX >= min(M,N), the criterion is not used. +*> +*> 2) The input parameter ABSTOL, the absolute tolerance for +*> the maximum column 2-norm of the residual matrix R22(K). This +*> means that the factorization stops if this norm is less or +*> equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used. +*> +*> 3) The input parameter RELTOL, the tolerance for the maximum +*> column 2-norm matrix of the residual matrix R22(K) divided +*> by the maximum column 2-norm of the original matrix A, which +*> is equal to abs(R(1,1)). This means that the factorization stops +*> when the ratio of the maximum column 2-norm of R22(K) to +*> the maximum column 2-norm of A is less than or equal to RELTOL. +*> If RELTOL < 0.0, the criterion is not used. +*> +*> 4) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix R22(K) is a zero matrix in some +*> factorization step K. ( This stopping criterion is implicit. ) +*> +*> The algorithm stops when any of these conditions is first +*> satisfied, otherwise the whole matrix A is factorized. +*> +*> To factorize the whole matrix A, use the values +*> KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0. +*> +*> The routine returns: +*> a) Q(K), R(K)_approx = ( R11(K), R12(K) ), +*> R(K)_residual = R22(K), P(K), i.e. the resulting matrices +*> of the factorization; P(K) is represented by JPIV, +*> ( if K = min(M,N), R(K)_approx is the full factor R, +*> and there is no residual matrix R(K)_residual); +*> b) K, the number of columns that were factorized, +*> i.e. factorization rank; +*> c) MAXC2NRMK, the maximum column 2-norm of the residual +*> matrix R(K)_residual = R22(K), +*> ( if K = min(M,N), MAXC2NRMK = 0.0 ); +*> d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum +*> column 2-norm of the original matrix A, which is equal +*> to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 ); +*> e) Q(K)**T * B, the matrix B with the orthogonal +*> transformation Q(K)**T applied on the left. +*> +*> The N-by-N permutation matrix P(K) is stored in a compact form in +*> the integer array JPIV. 
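+*>
+*> ( A usage sketch, for illustration only; variable declarations
+*> and a sufficiently large workspace are assumed. To request the
+*> full factorization described above, disable all three truncation
+*> criteria:
+*>
+*>    KMAX   = MIN( M, N )
+*>    ABSTOL = -1.0D+0
+*>    RELTOL = -1.0D+0
+*>    CALL DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+*>   $               K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*>   $               WORK, LWORK, IWORK, INFO )
+*>
+*> On return, provided no NaN was encountered in A, K = MIN(M,N)
+*> and MAXC2NRMK = RELMAXC2NRMK = 0.0, as stated below. )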
For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The M-by-M orthogonal matrix Q is represented as a product +*> of elementary Householder reflectors +*> +*> Q(K) = H(1) * H(2) * . . . * H(K), +*> +*> where K is the number of columns that were factorized. +*> +*> Each H(j) has the form +*> +*> H(j) = I - tau * v * v**T, +*> +*> where 1 <= j <= K and +*> I is an M-by-M identity matrix, +*> tau is a real scalar, +*> v is a real vector with v(1:j-1) = 0 and v(j) = 1. +*> +*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j). +*> +*> See the Further Details section for more information. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e. the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M,N), then this stopping criterion +*> is not used, the routine factorizes columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B are not modified, and +*> the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION +*> +*> The second factorization stopping criterion, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix R22(K). +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix R22(K) +*> is less than or equal to ABSTOL. Let SAFMIN = DLAMCH('S'). +*> +*> a) If ABSTOL is NaN, then no computation is performed +*> and an error message ( INFO = -5 ) is issued +*> by XERBLA. +*> +*> b) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN +*> is used. This includes the case ABSTOL = -0.0. +*> +*> d) If 2*SAFMIN <= ABSTOL then the input value +*> of ABSTOL is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If ABSTOL chosen above is >= MAXC2NRM, then this +*> stopping criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. The routine +*> returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case ABSTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION +*> +*> The third factorization stopping criterion, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio +*> abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of +*> the residual matrix R22(K) to the maximum column 2-norm of +*> the original matrix A. 
The algorithm converges (stops the +*> factorization), when abs(R(K+1,K+1))/abs(R(1,1)) is less +*> than or equal to RELTOL. Let EPS = DLAMCH('E'). +*> +*> a) If RELTOL is NaN, then no computation is performed +*> and an error message ( INFO = -6 ) is issued +*> by XERBLA. +*> +*> b) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used. +*> This includes the case RELTOL = -0.0. +*> +*> d) If EPS <= RELTOL then the input value of RELTOL +*> is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If RELTOL chosen above is >= 1.0, then this stopping +*> criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. +*> The routine returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case RELTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> +*> NOTE: We recommend that RELTOL satisfy +*> min( max(M,N)*EPS, sqrt(EPS) ) <= RELTOL +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N+NRHS) +*> +*> On entry: +*> +*> a) The subarray A(1:M,1:N) contains the M-by-N matrix A. +*> b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS +*> matrix B. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> +*> a) The subarray A(1:M,1:N) contains parts of the factors +*> of the matrix A: +*> +*> 1) If K = 0, A(1:M,1:N) contains the original matrix A. +*> 2) If K > 0, A(1:M,1:N) contains parts of the +*> factors: +*> +*> 1. The elements below the diagonal of the subarray +*> A(1:M,1:K) together with TAU(1:K) represent the +*> orthogonal matrix Q(K) as a product of K Householder +*> elementary reflectors. +*> +*> 2. The elements on and above the diagonal of +*> the subarray A(1:K,1:N) contain K-by-N +*> upper-trapezoidal matrix +*> R(K)_approx = ( R11(K), R12(K) ). +*> NOTE: If K=min(M,N), i.e. full rank factorization, +*> then R_approx(K) is the full factor R which +*> is upper-trapezoidal. If, in addition, M>=N, +*> then R is upper-triangular. +*> +*> 3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K) +*> rectangular matrix R(K)_residual = R22(K). +*> +*> b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains +*> the M-by-NRHS product Q(K)**T * B. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> This is the leading dimension for both matrices, A and B. +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> +*> NOTE: If K = 0, a) the arrays A and B are not modified; +*> b) the array TAU(1:min(M,N)) is set to ZERO, +*> if the matrix A does not contain NaN, +*> otherwise the elements TAU(1:min(M,N)) +*> are undefined; +*> c) the elements of the array JPIV are set +*> as follows: for j = 1:N, JPIV(j) = j. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix R22(K), +*> when the factorization stopped at rank K.
MAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then MAXC2NRMK equals the maximum column 2-norm +*> of the original matrix A. +*> +*> b) If 0 < K < min(M,N), then MAXC2NRMK is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then MAXC2NRMK = 0.0. +*> +*> NOTE: MAXC2NRMK in the factorization step K would equal +*> R(K+1,K+1) in the next factorization step K+1. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix R22(K) (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then RELMAXC2NRMK = 1.0. +*> +*> b) If 0 < K < min(M,N), then +*> RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then RELMAXC2NRMK = 0.0. +*> +*> NOTE: RELMAXC2NRMK in the factorization step K would equal +*> abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization +*> step K+1. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The elements of the array JPIV(1:N) are always set +*> by the routine, for example, even when no columns +*> were factorized, i.e. when K = 0, the elements are +*> set as JPIV(j) = j for j = 1:N. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is DOUBLE PRECISION array, dimension (min(M,N)) +*> The scalar factors of the elementary reflectors. +*> +*> If 0 < K <= min(M,N), only the elements TAU(1:K) of +*> the array TAU are modified by the factorization. +*> After the factorization is computed, if no NaN was found +*> during the factorization, the remaining elements +*> TAU(K+1:min(M,N)) are set to zero, otherwise the +*> elements TAU(K+1:min(M,N)) are not set and therefore +*> undefined. +*> ( If K = 0, all elements of TAU are set to zero, if +*> the matrix A does not contain NaN. ) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= (3*N+NRHS-1), otherwise. +*> For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )), +*> where NB is the optimal block size for DGEQP3RK returned +*> by ILAENV. Minimal block size MINNB=2. +*> +*> NOTE: The decision whether to use unblocked BLAS 2 +*> or blocked BLAS 3 code is based not only on the dimension +*> LWORK of the available workspace WORK, but also on the +*> matrix A dimension N via crossover point NX returned +*> by ILAENV. (For N less than NX, unblocked code should be +*> used.) +*> +*> If LWORK = -1, then a workspace query is assumed; +*> the routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA.
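+*>
+*> ( A minimal workspace-query sketch, for illustration only;
+*> declarations are assumed, with WQUERY a DOUBLE PRECISION
+*> array of length 1 and WORK allocatable:
+*>
+*>    CALL DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+*>   $               K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*>   $               WQUERY, -1, IWORK, INFO )
+*>    LWORK = INT( WQUERY( 1 ) )
+*>    ALLOCATE( WORK( LWORK ) )
+*>
+*> after which the factorization call is repeated with the
+*> allocated WORK and the returned LWORK. )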
+*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix in the blocked step auxiliary subroutine DLAQP3RK ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) INFO < 0: if INFO = -i, the i-th argument had an +*> illegal value. +*> 3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup geqp3rk +* +*> \par Further Details: +* ===================== +* +*> \verbatim +*> DGEQP3RK is based on the same BLAS3 Householder QR factorization +*> algorithm with column pivoting as in DGEQP3 routine which uses +*> DLARFG routine to generate Householder reflectors +*> for QR factorization. +*> +*> We can also write: +*> +*> A = A_approx(K) + A_residual(K) +*> +*> The low rank approximation matrix A(K)_approx from +*> the truncated QR factorization of rank K of the matrix A is: +*> +*> A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T +*> ( 0 0 ) +*> +*> = Q(K) * ( R11(K) R12(K) ) * P(K)**T +*> ( 0 0 ) +*> +*> The residual A_residual(K) of the matrix A is: +*> +*> A_residual(K) = Q(K) * ( 0 0 ) * P(K)**T = +*> ( 0 R(K)_residual ) +*> +*> = Q(K) * ( 0 0 ) * P(K)**T +*> ( 0 R22(K) ) +*> +*> The truncated (rank K) factorization guarantees that +*> the maximum column 2-norm of A_residual(K) is less than +*> or equal to MAXC2NRMK up to roundoff error. +*> +*> NOTE: An approximation of the null vectors +*> of A can be easily computed from R11(K) +*> and R12(K): +*> +*> Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) ) +*> ( -I ) +*> +*> \endverbatim +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. 
+*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, + $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ WORK, LWORK, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER INB, INBMIN, IXOVER + PARAMETER ( INB = 1, INBMIN = 2, IXOVER = 3 ) + DOUBLE PRECISION ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, DONE + INTEGER IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX, + $ JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB, + $ NBMIN, NX + DOUBLE PRECISION EPS, HUGEVAL, MAXC2NRM, SAFMIN +* .. +* .. External Subroutines .. + EXTERNAL DLAQP2RK, DLAQP3RK, XERBLA +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX, ILAENV + DOUBLE PRECISION DLAMCH, DNRM2 + EXTERNAL DISNAN, DLAMCH, DNRM2, IDAMAX, ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test input arguments +* ==================== +* + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( KMAX.LT.0 ) THEN + INFO = -4 + ELSE IF( DISNAN( ABSTOL ) ) THEN + INFO = -5 + ELSE IF( DISNAN( RELTOL ) ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -8 + END IF +* +* If the input parameters M, N, NRHS, KMAX, LDA are valid: +* a) Test the input workspace size LWORK for the minimum +* size requirement IWS. +* b) Determine the optimal block size NB and optimal +* workspace size LWKOPT to be returned in WORK(1) +* in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., +* (3) when routine exits. +* Here, IWS is the miminum workspace required for unblocked +* code. +* + IF( INFO.EQ.0 ) THEN + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + IWS = 1 + LWKOPT = 1 + ELSE +* +* Minimal workspace size in case of using only unblocked +* BLAS 2 code in DLAQP2RK. +* 1) DGEQP3RK and DLAQP2RK: 2*N to store full and partial +* column 2-norms. +* 2) DLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in DLARF subroutine inside DLAQP2RK to apply an +* elementary reflector from the left. +* TOTAL_WORK_SIZE = 3*N + NRHS - 1 +* + IWS = 3*N + NRHS - 1 +* +* Assign to NB optimal block size. 
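+*
+*        ( Illustration, assuming M = N = 1000, NRHS = 1 and a
+*        typical block size NB = 32: IWS above evaluates to
+*        3*1000 + 1 - 1 = 3000, while LWKOPT below evaluates to
+*        2*1000 + 32*( 1000+1+1 ) = 34064 workspace elements. )
+*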
+* + NB = ILAENV( INB, 'DGEQP3RK', ' ', M, N, -1, -1 ) +* +* A formula for the optimal workspace size in case of using +* both unblocked BLAS 2 in DLAQP2RK and blocked BLAS 3 code +* in DLAQP3RK. +* 1) DGEQP3RK, DLAQP2RK, DLAQP3RK: 2*N to store full and +* partial column 2-norms. +* 2) DLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in DLARF subroutine to apply an elementary reflector +* from the left. +* 3) DLAQP3RK: NB*(N+NRHS) to use in the work array F that +* is used to apply a block reflector from +* the left. +* 4) DLAQP3RK: NB to use in the auxilixary array AUX. +* Sizes (2) and ((3) + (4)) should intersect, therefore +* TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. +* + LWKOPT = 2*N + NB*( N+NRHS+1 ) + END IF + WORK( 1 ) = DBLE( LWKOPT ) +* + IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN + INFO = -15 + END IF + END IF +* +* NOTE: The optimal workspace size is returned in WORK(1), if +* the input parameters M, N, NRHS, KMAX, LDA are valid. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGEQP3RK', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible for M=0 or N=0. +* + IF( MINMN.EQ.0 ) THEN + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + WORK( 1 ) = DBLE( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* +* Initialize column pivot array JPIV. +* + DO J = 1, N + JPIV( J ) = J + END DO +* +* ================================================================== +* +* Initialize storage for partial and exact column 2-norms. +* a) The elements WORK(1:N) are used to store partial column +* 2-norms of the matrix A, and may decrease in each computation +* step; initialize to the values of complete columns 2-norms. +* b) The elements WORK(N+1:2*N) are used to store complete column +* 2-norms of the matrix A, they are not changed during the +* computation; initialize the values of complete columns 2-norms. +* + DO J = 1, N + WORK( J ) = DNRM2( M, A( 1, J ), 1 ) + WORK( N+J ) = WORK( J ) + END DO +* +* ================================================================== +* +* Compute the pivot column index and the maximum column 2-norm +* for the whole original matrix stored in A(1:M,1:N). +* + KP1 = IDAMAX( N, WORK( 1 ), 1 ) + MAXC2NRM = WORK( KP1 ) +* +* ==================================================================. +* + IF( DISNAN( MAXC2NRM ) ) THEN +* +* Check if the matrix A contains NaN, set INFO parameter +* to the column number where the first NaN is found and return +* from the routine. +* + K = 0 + INFO = KP1 +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = MAXC2NRM +* +* Array TAU is not set and contains undefined elements. +* + WORK( 1 ) = DBLE( LWKOPT ) + RETURN + END IF +* +* =================================================================== +* + IF( MAXC2NRM.EQ.ZERO ) THEN +* +* Check is the matrix A is a zero matrix, set array TAU and +* return from the routine. +* + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + DO J = 1, MINMN + TAU( J ) = ZERO + END DO +* + WORK( 1 ) = DBLE( LWKOPT ) + RETURN +* + END IF +* +* =================================================================== +* + HUGEVAL = DLAMCH( 'Overflow' ) +* + IF( MAXC2NRM.GT.HUGEVAL ) THEN +* +* Check if the matrix A contains +Inf or -Inf, set INFO parameter +* to the column number, where the first +/-Inf is found plus N, +* and continue the computation. 
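+*
+*        ( Illustration, assuming N = 10 and the first infinite
+*        entry in column KP1 = 3: INFO is set to N + KP1 = 13
+*        here and the factorization continues. )
+*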
+* + INFO = N + KP1 +* + END IF +* +* ================================================================== +* +* Quick return if possible for the case when the first +* stopping criterion is satisfied, i.e. KMAX = 0. +* + IF( KMAX.EQ.0 ) THEN + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE + DO J = 1, MINMN + TAU( J ) = ZERO + END DO + WORK( 1 ) = DBLE( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* + EPS = DLAMCH('Epsilon') +* +* Adjust ABSTOL +* + IF( ABSTOL.GE.ZERO ) THEN + SAFMIN = DLAMCH('Safe minimum') + ABSTOL = MAX( ABSTOL, TWO*SAFMIN ) + END IF +* +* Adjust RELTOL +* + IF( RELTOL.GE.ZERO ) THEN + RELTOL = MAX( RELTOL, EPS ) + END IF +* +* =================================================================== +* +* JMAX is the maximum index of the column to be factorized, +* which is also limited by the first stopping criterion KMAX. +* + JMAX = MIN( KMAX, MINMN ) +* +* =================================================================== +* +* Quick return if possible for the case when the second or third +* stopping criterion for the whole original matrix is satisfied, +* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL +* (which is ONE <= RELTOL). +* + IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN +* + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE +* + DO J = 1, MINMN + TAU( J ) = ZERO + END DO +* + WORK( 1 ) = DBLE( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* Factorize columns +* ================================================================== +* +* Determine the block size. +* + NBMIN = 2 + NX = 0 +* + IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN +* +* Determine when to cross over from blocked to unblocked code. +* (for N less than NX, unblocked code should be used). +* + NX = MAX( 0, ILAENV( IXOVER, 'DGEQP3RK', ' ', M, N, -1, -1 )) +* + IF( NX.LT.MINMN ) THEN +* +* Determine if workspace is large enough for blocked code. +* + IF( LWORK.LT.LWKOPT ) THEN +* +* Not enough workspace to use optimal block size that +* is currently stored in NB. +* Reduce NB and determine the minimum value of NB. +* + NB = ( LWORK-2*N ) / ( N+1 ) + NBMIN = MAX( 2, ILAENV( INBMIN, 'DGEQP3RK', ' ', M, N, + $ -1, -1 ) ) +* + END IF + END IF + END IF +* +* ================================================================== +* +* DONE is the boolean flag to represent the case when the +* factorization completed in the block factorization routine, +* before the end of the block. +* + DONE = .FALSE. +* +* J is the column index. +* + J = 1 +* +* (1) Use blocked code initially. +* +* JMAXB is the maximum column index of the block, when the +* blocked code is used; it is also limited by the first stopping +* criterion KMAX. +* + JMAXB = MIN( KMAX, MINMN - NX ) +* + IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN +* +* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: +* J is the column index of a column block; +* JB is the column block size to pass to block factorization +* routine in a loop step; +* JBF is the number of columns that were actually factorized, +* as returned by the block factorization routine +* in a loop step, JBF <= JB; +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + DO WHILE( J.LE.JMAXB ) +* + JB = MIN( NB, JMAXB-J+1 ) + N_SUB = N-J+1 + IOFFSET = J-1 +* +* Factorize JB columns among the columns A(J:N). +* + CALL DLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ DONE, JBF, MAXC2NRMK, RELMAXC2NRMK, + $ JPIV( J ), TAU( J ), + $ WORK( J ), WORK( N+J ), + $ WORK( 2*N+1 ), WORK( 2*N+JB+1 ), + $ N+NRHS-J+1, IWORK, IINFO ) +* +* Set INFO on the first occurrence of Inf. +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + END IF +* + IF( DONE ) THEN +* +* Either the submatrix is zero before the end of the +* column block, or ABSTOL or RELTOL criterion is +* satisfied before the end of the column block, so we can +* return from the routine. Perform the following before +* returning: +* a) Set the number of factorized columns K, +* K = IOFFSET + JBF from the last call of blocked +* routine. +* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned +* by the block factorization routine; +* 2) The remaining TAUs are set to ZERO by the +* block factorization routine. +* + K = IOFFSET + JBF +* +* Set INFO on the first occurrence of NaN, NaN takes +* precedence over Inf. +* + IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* +* Return from the routine. +* + WORK( 1 ) = DBLE( LWKOPT ) +* + RETURN +* + END IF +* + J = J + JBF +* + END DO +* + END IF +* +* Use unblocked code to factor the last or only block. +* J = JMAX+1 means we factorized the maximum possible number of +* columns, that is, in the ELSE clause we need to compute +* the MAXC2NORM and RELMAXC2NORM to return after we processed +* the blocks. +* + IF( J.LE.JMAX ) THEN +* +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + N_SUB = N-J+1 + IOFFSET = J-1 +* + CALL DLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1, + $ ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ), + $ TAU( J ), WORK( J ), WORK( N+J ), + $ WORK( 2*N+1 ), IINFO ) +* +* ABSTOL or RELTOL criterion is satisfied when the number of +* the factorized columns KF is smaller than the number +* of columns JMAX-J+1 supplied to be factorized by the +* unblocked routine; in this case we can return from +* the routine. Perform the following before returning: +* a) Set the number of factorized columns K, +* b) MAXC2NRMK and RELMAXC2NRMK are returned by the +* unblocked factorization routine above. +* + K = J - 1 + KF +* +* Set INFO on the first exception occurrence. +* +* Set INFO on the first exception occurrence of Inf or NaN, +* (NaN takes precedence over Inf). +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* + ELSE +* +* Compute the return values for blocked code. +* +* Set the number of factorized columns if the unblocked routine +* was not called. +* + K = JMAX +* +* If there exists a residual matrix after the blocked code: +* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the +* residual matrix, otherwise set them to ZERO; +* 2) Set TAU(K+1:MINMN) to ZERO.
+* + IF( K.LT.MINMN ) THEN + JMAXC2NRM = K + IDAMAX( N-K, WORK( K+1 ), 1 ) + MAXC2NRMK = WORK( JMAXC2NRM ) + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + DO J = K + 1, MINMN + TAU( J ) = ZERO + END DO +* + END IF +* +* END IF( J.LE.JMAX ) THEN +* + END IF +* + WORK( 1 ) = DBLE( LWKOPT ) +* + RETURN +* +* End of DGEQP3RK +* + END diff --git a/lapack-netlib/SRC/dgeqr.f b/lapack-netlib/SRC/dgeqr.f index eac8930ce..6ed8f211f 100644 --- a/lapack-netlib/SRC/dgeqr.f +++ b/lapack-netlib/SRC/dgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE DGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -244,8 +246,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +257,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +272,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +286,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +313,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ * RETURN * diff --git a/lapack-netlib/SRC/dgeqrfp.f b/lapack-netlib/SRC/dgeqrfp.f index 46d2ee479..aa757e96c 100644 --- a/lapack-netlib/SRC/dgeqrfp.f +++ b/lapack-netlib/SRC/dgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -122,7 +123,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup geqrfp * *> \par Further Details: * ===================== @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. 
EXTERNAL DGEQR2P, DLARFB, DLARFT, XERBLA @@ -181,8 +182,16 @@ * INFO = 0 NB = ILAENV( 1, 'DGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF WORK( 1 ) = LWKOPT +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -190,7 +199,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -202,7 +211,6 @@ * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -210,7 +218,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. diff --git a/lapack-netlib/SRC/dgerqf.f b/lapack-netlib/SRC/dgerqf.f index cca9d6367..435239cc7 100644 --- a/lapack-netlib/SRC/dgerqf.f +++ b/lapack-netlib/SRC/dgerqf.f @@ -114,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gerqf * *> \par Further Details: * ===================== @@ -189,7 +189,7 @@ END IF WORK( 1 ) = LWKOPT * - IF ( .NOT.LQUERY ) THEN + IF( .NOT.LQUERY ) THEN IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) $ INFO = -7 END IF diff --git a/lapack-netlib/SRC/dgesvj.f b/lapack-netlib/SRC/dgesvj.f index 5fdb21e45..198bfb0a5 100644 --- a/lapack-netlib/SRC/dgesvj.f +++ b/lapack-netlib/SRC/dgesvj.f @@ -208,7 +208,7 @@ *> *> \param[in,out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On entry : *> If JOBU = 'C' : *> WORK(1) = CTOL, where CTOL defines the threshold for convergence. @@ -239,7 +239,12 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> length of WORK, WORK >= MAX(6,M+N) +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(6,M+N), otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; WORK(1) is set to the minial (and optimal) +*> length of WORK. *> \endverbatim *> *> \param[out] INFO @@ -260,7 +265,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -365,9 +370,9 @@ INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, - $ SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + $ SWBAND, MINMN, LWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, + $ ROTOK, RSVEC, UCTOL, UPPER * .. * .. Local Arrays .. DOUBLE PRECISION FASTR( 5 ) @@ -408,6 +413,14 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 6, M+N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -427,7 +440,7 @@ INFO = -11 ELSE IF( UCTOL .AND. ( WORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( LWORK.LT.MAX( M+N, 6 ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
( .NOT.LQUERY ) ) THEN INFO = -13 ELSE INFO = 0 @@ -437,11 +450,14 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGESVJ', -INFO ) RETURN + ELSE IF( LQUERY ) THEN + WORK( 1 ) = LWMIN + RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/dgetri.f b/lapack-netlib/SRC/dgetri.f index 92ef90c18..7b5a3a1b6 100644 --- a/lapack-netlib/SRC/dgetri.f +++ b/lapack-netlib/SRC/dgetri.f @@ -107,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup getri * * ===================================================================== SUBROUTINE DGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -151,8 +151,9 @@ * INFO = 0 NB = ILAENV( 1, 'DGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT +* LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/dgetsls.f b/lapack-netlib/SRC/dgetsls.f index 25f4c12c2..73b505ff7 100644 --- a/lapack-netlib/SRC/dgetsls.f +++ b/lapack-netlib/SRC/dgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsolve +*> \ingroup getsls * * ===================================================================== SUBROUTINE DGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, @@ -189,7 +189,7 @@ * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH, DLANGE - EXTERNAL LSAME, DLABAD, DLAMCH, DLANGE + EXTERNAL LSAME, DLAMCH, DLANGE * .. * .. External Subroutines .. EXTERNAL DGEQR, DGEMQR, DLASCL, DLASET, @@ -226,7 +226,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEM = 1 + WSIZEO = 1 + ELSE IF( M.GE.N ) THEN CALL DGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) @@ -294,7 +297,6 @@ * SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * diff --git a/lapack-netlib/SRC/dgetsqrhrt.f b/lapack-netlib/SRC/dgetsqrhrt.f index 668deeba8..682c7c30f 100644 --- a/lapack-netlib/SRC/dgetsqrhrt.f +++ b/lapack-netlib/SRC/dgetsqrhrt.f @@ -130,14 +130,17 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -160,7 +163,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
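*
*> ( A workspace-query sketch for DGETSQRHRT, for illustration only;
*> the argument order M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
*> LWORK, INFO is inferred from the INFO codes checked above, and
*> declarations are assumed, with WQUERY a DOUBLE PRECISION array
*> of length 1:
*>
*>    CALL DGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT,
*>   $                 WQUERY, -1, INFO )
*>    LWORK = INT( WQUERY( 1 ) )
*>
*> after which WORK can be allocated and the call repeated with the
*> actual LWORK. )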
* -*> \ingroup doubleOTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -212,7 +215,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -225,7 +228,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -263,8 +266,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN INFO = -11 END IF * @@ -346,4 +350,4 @@ * * End of DGETSQRHRT * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/dgges.f b/lapack-netlib/SRC/dgges.f index 31db23715..b9ffc7982 100644 --- a/lapack-netlib/SRC/dgges.f +++ b/lapack-netlib/SRC/dgges.f @@ -234,8 +234,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If N = 0, LWORK >= 1, else LWORK >= 8*N+16. -*> For good performance , LWORK must generally be larger. +*> If N = 0, LWORK >= 1, else LWORK >= MAX(8*N,6*N+16). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -275,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEeigen +*> \ingroup gges * * ===================================================================== SUBROUTINE DGGES( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, LDB, @@ -321,9 +321,8 @@ DOUBLE PRECISION DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHRD, DHGEQZ, DLABAD, - $ DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, - $ XERBLA + EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHRD, DHGEQZ, DLACPY, + $ DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, XERBLA * .. * .. External Functions .. LOGICAL LSAME @@ -431,7 +430,6 @@ EPS = DLAMCH( 'P' ) SAFMIN = DLAMCH( 'S' ) SAFMAX = ONE / SAFMIN - CALL DLABAD( SAFMIN, SAFMAX ) SMLNUM = SQRT( SAFMIN ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/dgges3.f b/lapack-netlib/SRC/dgges3.f index 7b00d294a..2ef55951a 100644 --- a/lapack-netlib/SRC/dgges3.f +++ b/lapack-netlib/SRC/dgges3.f @@ -234,6 +234,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK >= 6*N+16. +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -273,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE DGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -309,7 +311,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, LST2SL, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT + $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT, + $ LWKMIN DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SAFMAX, SAFMIN, SMLNUM * .. @@ -318,9 +321,8 @@ DOUBLE PRECISION DIF( 2 ) * .. * .. External Subroutines .. 
- EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLABAD, - $ DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, - $ XERBLA + EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLACPY, + $ DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, XERBLA * .. * .. External Functions .. LOGICAL LSAME @@ -362,6 +364,12 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + IF( N.EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 6*N+16 + END IF +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -378,7 +386,7 @@ INFO = -15 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -17 - ELSE IF( LWORK.LT.6*N+16 .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -19 END IF * @@ -386,29 +394,33 @@ * IF( INFO.EQ.0 ) THEN CALL DGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 6*N+16, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL DORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) IF( ILVSL ) THEN CALL DORGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) END IF CALL DGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL, $ LDVSL, VSR, LDVSR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL DLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VSL, LDVSL, VSR, LDVSR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) IF( WANTST ) THEN CALL DTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VSL, LDVSL, VSR, LDVSR, $ SDIM, PVSL, PVSR, DIF, WORK, -1, IDUM, 1, $ IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = LWKOPT END IF - WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN @@ -430,7 +442,6 @@ EPS = DLAMCH( 'P' ) SAFMIN = DLAMCH( 'S' ) SAFMAX = ONE / SAFMIN - CALL DLABAD( SAFMIN, SAFMAX ) SMLNUM = SQRT( SAFMIN ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/dggev3.f b/lapack-netlib/SRC/dggev3.f index 4bbe8a40f..b970c04c4 100644 --- a/lapack-netlib/SRC/dggev3.f +++ b/lapack-netlib/SRC/dggev3.f @@ -188,7 +188,9 @@ *> *> \param[in] LWORK *> \verbatim -*> LWORK is INTEGER +*> LWORK is INTEGER. +*> The dimension of the array WORK. LWORK >= MAX(1,8*N). +*> For good performance, LWORK should generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -217,7 +219,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE DGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHAR, @@ -248,7 +250,8 @@ LOGICAL ILASCL, ILBSCL, ILV, ILVL, ILVR, LQUERY CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, - $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT + $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT, + $ LWKMIN DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP * .. @@ -256,9 +259,8 @@ LOGICAL LDUMMA( 1 ) * .. * .. External Subroutines .. 
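The DGGEV3 hunks that follow rework the workspace logic the same way DGGES3 does above: a documented minimum (LWKMIN = MAX(1, 8*N) here, 6*N+16 for DGGES3) is validated against LWORK, and the optimal size is then accumulated by probing each stage (DGEQRF, DORMQR, DORGQR, DGGHD3, DLAQZ0, DTGSEN) with LWORK = -1 and keeping a running maximum. A schematic of that folding step; stage_req[] is a hypothetical array holding the per-stage values of 3*N+INT(WORK(1)) read back from the probes:

    /* Fold per-stage LWORK = -1 probe results into one optimal size. */
    static int fold_lwkopt(int lwkmin, const int *stage_req, int nstages)
    {
        int lwkopt = lwkmin;
        for (int i = 0; i < nstages; i++)
            if (stage_req[i] > lwkopt)
                lwkopt = stage_req[i];   /* LWKOPT = MAX( LWKOPT, ... ) */
        return lwkopt;
    }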
- EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLABAD, - $ DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGEVC, - $ XERBLA + EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLACPY, + $ DLASCL, DLASET, DORGQR, DORMQR, DTGEVC, XERBLA * .. * .. External Functions .. LOGICAL LSAME @@ -299,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 8*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -313,7 +316,7 @@ INFO = -12 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -14 - ELSE IF( LWORK.LT.MAX( 1, 8*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF * @@ -321,13 +324,13 @@ * IF( INFO.EQ.0 ) THEN CALL DGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX(1, 8*N, 3*N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL DORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, -1, $ IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) IF( ILVL ) THEN CALL DORGQR( N, N, N, VL, LDVL, WORK, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) END IF IF( ILV ) THEN CALL DGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL, @@ -336,18 +339,21 @@ CALL DLAQZ0( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) ELSE CALL DGGHD3( 'N', 'N', N, 1, N, A, LDA, B, LDB, VL, LDVL, $ VR, LDVR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL DLAQZ0( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = LWKOPT END IF - - WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN @@ -367,7 +373,6 @@ EPS = DLAMCH( 'P' ) SMLNUM = DLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/dgghd3.f b/lapack-netlib/SRC/dgghd3.f index 43d7a77df..21a668573 100644 --- a/lapack-netlib/SRC/dgghd3.f +++ b/lapack-netlib/SRC/dgghd3.f @@ -179,14 +179,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> -*> \param[in] LWORK +*> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -211,7 +211,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -275,7 +275,12 @@ * INFO = 0 NB = ILAENV( 1, 'DGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF WORK( 1 ) = DBLE( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. 
LSAME( COMPQ, 'V' ) @@ -325,7 +330,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = ONE RETURN @@ -885,6 +889,7 @@ IF ( JCOL.LT.IHI ) $ CALL DGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) +* WORK( 1 ) = DBLE( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/dggqrf.f b/lapack-netlib/SRC/dggqrf.f index 39d27a5c9..edac7f22f 100644 --- a/lapack-netlib/SRC/dggqrf.f +++ b/lapack-netlib/SRC/dggqrf.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup ggqrf * *> \par Further Details: * ===================== @@ -250,7 +250,7 @@ NB2 = ILAENV( 1, 'DGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'DORMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN @@ -287,6 +287,7 @@ * RQ factorization of N-by-P matrix B: B = T*Z. * CALL DGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO ) +* WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) * RETURN diff --git a/lapack-netlib/SRC/dggrqf.f b/lapack-netlib/SRC/dggrqf.f index ddf4104c5..3b1024c1c 100644 --- a/lapack-netlib/SRC/dggrqf.f +++ b/lapack-netlib/SRC/dggrqf.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup ggrqf * *> \par Further Details: * ===================== @@ -249,7 +249,7 @@ NB2 = ILAENV( 1, 'DGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'DORMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN diff --git a/lapack-netlib/SRC/dggsvd3.f b/lapack-netlib/SRC/dggsvd3.f index 503f0d8cc..ee4d11e86 100644 --- a/lapack-netlib/SRC/dggsvd3.f +++ b/lapack-netlib/SRC/dggsvd3.f @@ -278,7 +278,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -328,7 +328,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/dggsvp3.f b/lapack-netlib/SRC/dggsvp3.f index 4e1db3117..485d95b36 100644 --- a/lapack-netlib/SRC/dggsvp3.f +++ b/lapack-netlib/SRC/dggsvp3.f @@ -227,7 +227,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -250,7 +250,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/dlamswlq.f b/lapack-netlib/SRC/dlamswlq.f index 70e78f4b1..07ef1bd57 100644 --- a/lapack-netlib/SRC/dlamswlq.f +++ b/lapack-netlib/SRC/dlamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. 
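The behavioural point of this hunk (and of the matching DLAMTSQR hunk further down) is spelled out just below: a degenerate problem with MIN(M,N,K) = 0 now only requires LWORK >= 1 instead of the side-dependent bound, and LWORK = -1 returns the minimal rather than the optimal size. Mirrored in C as a sketch, where lw is the side-dependent requirement (N*MB on the left, M*MB on the right, as in the Fortran local LW):

    /* Minimal LWORK as DLAMSWLQ now computes it (LWMIN in the hunk). */
    static int lamswlq_lwmin(int m, int n, int k, int lw)
    {
        int minmnk = m < n ? m : n;
        if (k < minmnk) minmnk = k;
        if (minmnk == 0)
            return 1;                /* nothing to factorize or apply */
        return lw > 1 ? lw : 1;      /* MAX( 1, LW )                  */
    }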
-*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -189,29 +192,31 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE DLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + DOUBLE PRECISION A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, CTR, LW + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, CTR, LW, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -223,52 +228,60 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * INFO = 0 IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. 
(.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLAMSWLQ', -INFO ) - WORK(1) = LW RETURN - ELSE IF (LQUERY) THEN - WORK(1) = LW + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -402,7 +415,8 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN +* RETURN * * End of DLAMSWLQ diff --git a/lapack-netlib/SRC/dlamtsqr.f b/lapack-netlib/SRC/dlamtsqr.f index 962a31476..023db5ac9 100644 --- a/lapack-netlib/SRC/dlamtsqr.f +++ b/lapack-netlib/SRC/dlamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) -*> +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -191,29 +193,31 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE DLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + DOUBLE PRECISION A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -225,12 +229,13 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -238,11 +243,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. 
.NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -253,38 +264,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = LW + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL DGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -410,7 +421,8 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN +* RETURN * * End of DLAMTSQR diff --git a/lapack-netlib/SRC/dlaqp2rk.c b/lapack-netlib/SRC/dlaqp2rk.c new file mode 100644 index 000000000..de216ad97 --- /dev/null +++ b/lapack-netlib/SRC/dlaqp2rk.c @@ -0,0 +1,923 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen 
ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + *info = *n + kk - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL >= ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL >= ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + +/* Set K, the number of factorized columns. */ + + *k = kk - 1; + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. */ + + i__2 = minmnfact; + for (j = kk; j <= i__2; ++j) { + tau[j] = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,KK:N): */ +/* 1) swap the KK-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) copy the KK-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than KK in the next loop step.) */ +/* 3) Save the pivot interchange with the indices relative to the */ +/* the original matrix A, not the block A(1:M,1:N). */ + + if (kp != kk) { + dswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1); + vn1[kp] = vn1[kk]; + vn2[kp] = vn2[kk]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[kk]; + jpiv[kk] = itemp; + } + +/* Generate elementary reflector H(KK) using the column A(I:M,KK), */ +/* if the column has more than one element, otherwise */ +/* the elementary reflector would be an identity matrix, */ +/* and TAU(KK) = ZERO. 
*/ + + if (i__ < *m) { + i__2 = *m - i__ + 1; + dlarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], & + c__1, &tau[kk]); + } else { + tau[kk] = 0.; + } + +/* Check if TAU(KK) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(KK) for Inf, */ +/* since DLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by DLARFG can contain Inf, which requires */ +/* TAU(KK) to contain NaN. Therefore, this case of generating Inf */ +/* by DLARFG is covered by checking TAU(KK) for NaN. */ + + if (disnan_(&tau[kk])) { + *k = kk - 1; + *info = kk; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = tau[kk]; + *relmaxc2nrmk = tau[kk]; + +/* Array TAU(KK:MINMNFACT) is not set and contains */ +/* undefined elements, except the first element TAU(KK) = NaN. */ + + return 0; + } + +/* Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left. */ +/* ( If M >= N, then at KK = N there is no residual matrix, */ +/* i.e. no columns of A to update, only columns of B. */ +/* If M < N, then at KK = M-IOFFSET, I = M and we have a */ +/* one-row residual matrix in A and the elementary */ +/* reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update */ +/* is needed for the residual matrix in A and the */ +/* right-hand-side-matrix in B. */ +/* Therefore, we update only if */ +/* KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */ +/* condition is satisfied, not only KK < N+NRHS ) */ + + if (kk < minmnupdt) { + aikk = a[i__ + kk * a_dim1]; + a[i__ + kk * a_dim1] = 1.; + i__2 = *m - i__ + 1; + i__3 = *n + *nrhs - kk; + dlarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &tau[ + kk], &a[i__ + (kk + 1) * a_dim1], lda, &work[1]); + a[i__ + kk * a_dim1] = aikk; + } + + if (kk < minmnfact) { + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */ +/* when KK < f2cmin(M-IOFFSET, N). */ + + i__2 = *n; + for (j = kk + 1; j <= i__2; ++j) { + if (vn1[j] != 0.) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + +/* Computing 2nd power */ + d__2 = (d__1 = a[i__ + j * a_dim1], abs(d__1)) / vn1[j]; + temp = 1. - d__2 * d__2; + temp = f2cmax(temp,0.); +/* Computing 2nd power */ + d__1 = vn1[j] / vn2[j]; + temp2 = temp * (d__1 * d__1); + if (temp2 <= tol3z) { + +/* Compute the column 2-norm for the partial */ +/* column A(I+1:M,J) by explicitly computing it, */ +/* and store it in both partial 2-norm vector VN1 */ +/* and exact column 2-norm vector VN2. */ + + i__3 = *m - i__; + vn1[j] = dnrm2_(&i__3, &a[i__ + 1 + j * a_dim1], & + c__1); + vn2[j] = vn1[j]; + + } else { + +/* Update the column 2-norm for the partial */ +/* column A(I+1:M,J) by removing one */ +/* element A(I,J) and store it in partial */ +/* 2-norm vector VN1. */ + + vn1[j] *= sqrt(temp); + + } + } + } + + } + +/* End factorization loop */ + + } + +/* If we reached this point, all colunms have been factorized, */ +/* i.e. no condition was triggered to exit the routine. */ +/* Set the number of factorized columns. */ + + *k = *kmax; + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */ +/* we return. 
*/ + + if (*k < minmnfact) { + + i__1 = *n - *k; + jmaxc2nrm = *k + idamax_(&i__1, &vn1[*k + 1], &c__1); + *maxc2nrmk = vn1[jmaxc2nrm]; + + if (*k == 0) { + *relmaxc2nrmk = 1.; + } else { + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + } + + } else { + *maxc2nrmk = 0.; + *relmaxc2nrmk = 0.; + } + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, set TAUs corresponding to the columns that were */ +/* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. */ + + i__1 = minmnfact; + for (j = *k + 1; j <= i__1; ++j) { + tau[j] = 0.; + } + + return 0; + +/* End of DLAQP2RK */ + +} /* dlaqp2rk_ */ + diff --git a/lapack-netlib/SRC/dlaqp2rk.f b/lapack-netlib/SRC/dlaqp2rk.f new file mode 100644 index 000000000..aecd6bb69 --- /dev/null +++ b/lapack-netlib/SRC/dlaqp2rk.f @@ -0,0 +1,713 @@ +*> \brief \b DLAQP2RK computes truncated QR factorization with column pivoting of a real matrix block using Level 2 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DLAQP2RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, +* $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, +* $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, +* $ INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER JPIV( * ) +* DOUBLE PRECISION A( LDA, * ), TAU( * ), VN1( * ), VN2( * ), +* $ WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLAQP2RK computes a truncated (rank K) or full rank Householder QR +*> factorization with column pivoting of a real matrix +*> block A(IOFFSET+1:M,1:N) as +*> +*> A * P(K) = Q(K) * R(K). +*> +*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N) +*> is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides matrix block B +*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**T * B. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M-IOFFSET,N), then this stopping +*> criterion is not used, factorize columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. 
+*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The second factorization stopping criterion. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The third factorization stopping criterion. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine DGEQP3RK. 1 <= KP1 <= N_orig_mat. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is DOUBLE PRECISION +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine DGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:K) below +*> the diagonal together with the array TAU represent +*> the orthogonal matrix Q(K) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:K) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS). +*> The left part A(IOFFSET+1:M,K+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(K)**T. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. 
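Taken together, the KMAX, ABSTOL and RELTOL arguments described above form a three-way stopping rule that DLAQP2RK re-evaluates at every factorization step. A compact restatement as a hypothetical C predicate; the Fortran code applies exactly these comparisons to MAXC2NRMK and to its ratio against MAXC2NRM:

    /* Returns nonzero when any of the three stopping criteria fires. */
    static int qp2rk_should_stop(int k, int kmax, double maxc2nrmk,
                                 double maxc2nrm, double abstol,
                                 double reltol)
    {
        if (k >= kmax)                      return 1;  /* rank cap     */
        if (maxc2nrmk <= abstol)            return 1;  /* absolute tol */
        if (maxc2nrmk / maxc2nrm <= reltol) return 1;  /* relative tol */
        return 0;
    }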
+*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is DOUBLE PRECISION array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is DOUBLE PRECISION array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is DOUBLE PRECISION array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (N-1) +*> Used in DLARF subroutine to apply an elementary +*> reflector from the left. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp2rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. 
+*> \htmlonly
+*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> https://doi.org/10.1145/1377612.1377616
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November 2023, Igor Kozachenko, James Demmel,
+*>                 EECS Department,
+*>                 University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE DLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+     $                     INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            JPIV( * )
+      DOUBLE PRECISION   A( LDA, * ), TAU( * ), VN1( * ), VN2( * ),
+     $                   WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT,
+     $                   MINMNUPDT
+      DOUBLE PRECISION   AIKK, HUGEVAL, TEMP, TEMP2, TOL3Z
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLARF, DLARFG, DSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            DISNAN
+      INTEGER            IDAMAX
+      DOUBLE PRECISION   DLAMCH, DNRM2
+      EXTERNAL           DISNAN, DLAMCH, IDAMAX, DNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT is the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+*     MINMNUPDT is the smallest dimension
+*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be updated, which
+*     contains the submatrices A(IOFFSET+1:M,1:N) and
+*     B(IOFFSET+1:M,1:NRHS) as column blocks.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      KMAX = MIN( KMAX, MINMNFACT )
+      TOL3Z = SQRT( DLAMCH( 'Epsilon' ) )
+      HUGEVAL = DLAMCH( 'Overflow' )
+*
+*     Compute the factorization, KK is the column loop index.
+*
+      DO KK = 1, KMAX
+*
+         I = IOFFSET + KK
+*
+         IF( I.EQ.1 ) THEN
+*
+*           ============================================================
+*
+*           We are at the first column of the original whole matrix A,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+
+            KP = KP1
+*
+*           ============================================================
+*
+         ELSE
+*
+*           ============================================================
+*
+*           Determine the pivot column in the KK-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,KK:N).
+*
+            KP = ( KK-1 ) + IDAMAX( N-KK+1, VN1( KK ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,KK:N) in step KK.
+*           RELMAXC2NRMK will be computed later, after some condition
+*           checks on MAXC2NRMK.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains NaN, and set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( DISNAN( MAXC2NRMK ) ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + INFO = K + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* Array TAU(K+1:MINMNFACT) is not set and contains +* undefined elements. +* + RETURN + END IF +* +* ============================================================ +* +* Quick return, if the submatrix A(I:M,KK:N) is +* a zero matrix. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + RELMAXC2NRMK = ZERO +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + KK - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third stopping criteria. +* NOTE: There is no need to test for ABSTOL >= ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL >= ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. + + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* +* Set K, the number of factorized columns. +* + K = KK - 1 +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,KK:N): +* 1) swap the KK-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) copy the KK-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than KK in the next loop step.) +* 3) Save the pivot interchange with the indices relative to the +* the original matrix A, not the block A(1:M,1:N). 
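The VN1/VN2 bookkeeping in the loop below is the LAWN 176 partial-norm downdate cited in the references: after row I is eliminated, each remaining partial column norm is cheaply downdated, and only recomputed from scratch when cancellation makes the downdate untrustworthy. With \( \nu_j \) for VN1(J), \( \hat{\nu}_j \) for VN2(J) and \( \mathrm{tol3z} = \sqrt{\varepsilon} \), the update reads:

    \nu_j \leftarrow \nu_j\,\sqrt{\max\Bigl(0,\;1 - \bigl(|a_{ij}|/\nu_j\bigr)^{2}\Bigr)},
    \qquad
    \text{recompute } \nu_j = \lVert A(i{+}1{:}m,\,j)\rVert_2,\ \hat{\nu}_j = \nu_j
    \quad\text{when}\quad
    \Bigl(1 - \bigl(|a_{ij}|/\nu_j\bigr)^{2}\Bigr)\bigl(\nu_j/\hat{\nu}_j\bigr)^{2} \le \mathrm{tol3z}.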
+*
+         IF( KP.NE.KK ) THEN
+            CALL DSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 )
+            VN1( KP ) = VN1( KK )
+            VN2( KP ) = VN2( KK )
+            ITEMP = JPIV( KP )
+            JPIV( KP ) = JPIV( KK )
+            JPIV( KK ) = ITEMP
+         END IF
+*
+*        Generate elementary reflector H(KK) using the column A(I:M,KK),
+*        if the column has more than one element, otherwise
+*        the elementary reflector would be an identity matrix,
+*        and TAU(KK) = ZERO.
+*
+         IF( I.LT.M ) THEN
+            CALL DLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1,
+     $                   TAU( KK ) )
+         ELSE
+            TAU( KK ) = ZERO
+         END IF
+*
+*        Check if TAU(KK) contains NaN, set INFO parameter
+*        to the column number where NaN is found and return from
+*        the routine.
+*        NOTE: There is no need to check TAU(KK) for Inf,
+*        since DLARFG cannot produce TAU(KK) or Householder vector
+*        below the diagonal containing Inf. Only BETA on the diagonal,
+*        returned by DLARFG can contain Inf, which requires
+*        TAU(KK) to contain NaN. Therefore, this case of generating Inf
+*        by DLARFG is covered by checking TAU(KK) for NaN.
+*
+         IF( DISNAN( TAU(KK) ) ) THEN
+            K = KK - 1
+            INFO = KK
+*
+*           Set MAXC2NRMK and RELMAXC2NRMK to NaN.
+*
+            MAXC2NRMK = TAU( KK )
+            RELMAXC2NRMK = TAU( KK )
+*
+*           Array TAU(KK:MINMNFACT) is not set and contains
+*           undefined elements, except the first element TAU(KK) = NaN.
+*
+            RETURN
+         END IF
+*
+*        Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left.
+*        ( If M >= N, then at KK = N there is no residual matrix,
+*          i.e. no columns of A to update, only columns of B.
+*          If M < N, then at KK = M-IOFFSET, I = M and we have a
+*          one-row residual matrix in A and the elementary
+*          reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update
+*          is needed for the residual matrix in A and the
+*          right-hand-side matrix in B.
+*          Therefore, we update only if
+*          KK < MINMNUPDT = min(M-IOFFSET, N+NRHS)
+*          condition is satisfied, not only KK < N+NRHS )
+*
+         IF( KK.LT.MINMNUPDT ) THEN
+            AIKK = A( I, KK )
+            A( I, KK ) = ONE
+            CALL DLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1,
+     $                  TAU( KK ), A( I, KK+1 ), LDA, WORK( 1 ) )
+            A( I, KK ) = AIKK
+         END IF
+*
+         IF( KK.LT.MINMNFACT ) THEN
+*
+*           Update the partial column 2-norms for the residual matrix,
+*           only if the residual matrix A(I+1:M,KK+1:N) exists, i.e.
+*           when KK < min(M-IOFFSET, N).
+*
+            DO J = KK + 1, N
+               IF( VN1( J ).NE.ZERO ) THEN
+*
+*                 NOTE: The following lines follow from the analysis in
+*                 Lapack Working Note 176.
+*
+                  TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2
+                  TEMP = MAX( TEMP, ZERO )
+                  TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+                  IF( TEMP2 .LE. TOL3Z ) THEN
+*
+*                    Compute the column 2-norm for the partial
+*                    column A(I+1:M,J) by explicitly computing it,
+*                    and store it in both partial 2-norm vector VN1
+*                    and exact column 2-norm vector VN2.
+*
+                     VN1( J ) = DNRM2( M-I, A( I+1, J ), 1 )
+                     VN2( J ) = VN1( J )
+*
+                  ELSE
+*
+*                    Update the column 2-norm for the partial
+*                    column A(I+1:M,J) by removing one
+*                    element A(I,J) and store it in partial
+*                    2-norm vector VN1.
+*
+                     VN1( J ) = VN1( J )*SQRT( TEMP )
+*
+                  END IF
+               END IF
+            END DO
+*
+         END IF
+*
+*        End factorization loop
+*
+      END DO
+*
+*     If we reached this point, all columns have been factorized,
+*     i.e. no condition was triggered to exit the routine.
+*     Set the number of factorized columns.
+*
+      K = KMAX
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before
+*     we return.
+* + IF( K.LT.MINMNFACT ) THEN +* + JMAXC2NRM = K + IDAMAX( N-K, VN1( K+1 ), 1 ) + MAXC2NRMK = VN1( JMAXC2NRM ) +* + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + END IF +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, set TAUs corresponding to the columns that were +* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. +* + DO J = K + 1, MINMNFACT + TAU( J ) = ZERO + END DO +* + RETURN +* +* End of DLAQP2RK +* + END diff --git a/lapack-netlib/SRC/dlaqp3rk.c b/lapack-netlib/SRC/dlaqp3rk.c new file mode 100644 index 000000000..e8c61c257 --- /dev/null +++ b/lapack-netlib/SRC/dlaqp3rk.c @@ -0,0 +1,1113 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + 
ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + dgemm_("No transpose", "Transpose", &i__1, nrhs, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* Quick return, if the submatrix A(I:M,K:N) is */ +/* a zero matrix. We need to check it only if the column index */ +/* (same as row index) is larger than 1, since the condition */ +/* for the whole original matrix A_orig is checked in the main */ +/* routine. */ + + if (*maxc2nrmk == 0.) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *relmaxc2nrmk = 0.; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix is zero and we stop the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + dgemm_("No transpose", "Transpose", &i__1, nrhs, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + tau[j] = 0.; + } + +/* Return from the routine. 
*/ + + return 0; + + } + +/* ============================================================ */ + +/* Check if the submatrix A(I:M,K:N) contains Inf, */ +/* set INFO parameter to the column number, where */ +/* the first Inf is found plus N, and continue */ +/* the computation. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + if (*info == 0 && *maxc2nrmk > myhugeval) { + *info = *n + k - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third tolerance stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL.GE.ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig; */ + + *kb = k - 1; + if__ = i__ - 1; + +/* Apply the block reflector to the residual of the */ +/* matrix A and the residual of the right hand sides B, if */ +/* the residual matrix and and/or the residual of the right */ +/* hand sides exist, i.e. if the submatrix */ +/* A(I+1:M,KB+1:N+NRHS) exists. This occurs when */ +/* KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + dgemm_("No transpose", "Transpose", &i__1, &i__2, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*kb + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + tau[j] = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,K:N): */ +/* 1) swap the K-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */ +/* 3) copy the K-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. (Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than K in the next loop step.) */ +/* 4) Save the pivot interchange with the indices relative to the */ +/* the original matrix A_orig, not the block A(1:M,1:N). 
*/ + + if (kp != k) { + dswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + dswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf); + vn1[kp] = vn1[k]; + vn2[kp] = vn2[k]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[k]; + jpiv[k] = itemp; + } + +/* Apply previous Householder reflectors to column K: */ +/* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b7, &a[i__ + a_dim1], lda, + &f[k + f_dim1], ldf, &c_b8, &a[i__ + k * a_dim1], &c__1); + } + +/* Generate elementary reflector H(k) using the column A(I:M,K). */ + + if (i__ < *m) { + i__1 = *m - i__ + 1; + dlarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], & + c__1, &tau[k]); + } else { + tau[k] = 0.; + } + +/* Check if TAU(K) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(K) for Inf, */ +/* since DLARFG cannot produce TAU(K) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by DLARFG can contain Inf, which requires */ +/* TAU(K) to contain NaN. Therefore, this case of generating Inf */ +/* by DLARFG is covered by checking TAU(K) for NaN. */ + + if (disnan_(&tau[k])) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *info = k; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = tau[k]; + *relmaxc2nrmk = tau[k]; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix contains NaN and we stop */ +/* the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + dgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &c_b7, & + a[if__ + 1 + a_dim1], lda, &f[*n + 1 + f_dim1], ldf, & + c_b8, &a[if__ + 1 + (*n + 1) * a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* =============================================================== */ + + aik = a[i__ + k * a_dim1]; + a[i__ + k * a_dim1] = 1.; + +/* =============================================================== */ + +/* Compute the current K-th column of F: */ +/* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K). */ + + if (k < *n + *nrhs) { + i__1 = *m - i__ + 1; + i__2 = *n + *nrhs - k; + dgemv_("Transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + 1) * + a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b30, &f[k + + 1 + k * f_dim1], &c__1); + } + +/* 2) Zero out elements above and on the diagonal of the */ +/* column K in matrix F, i.e elements F(1:K,K). 
*/ + + i__1 = k; + for (j = 1; j <= i__1; ++j) { + f[j + k * f_dim1] = 0.; + } + +/* 3) Incremental updating of the K-th column of F: */ +/* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T */ +/* * A(I:M,K). */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + d__1 = -tau[k]; + dgemv_("Transpose", &i__1, &i__2, &d__1, &a[i__ + a_dim1], lda, & + a[i__ + k * a_dim1], &c__1, &c_b30, &auxv[1], &c__1); + + i__1 = *n + *nrhs; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b8, &f[f_dim1 + 1], ldf, & + auxv[1], &c__1, &c_b8, &f[k * f_dim1 + 1], &c__1); + } + +/* =============================================================== */ + +/* Update the current I-th row of A: */ +/* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */ +/* - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. */ + + if (k < *n + *nrhs) { + i__1 = *n + *nrhs - k; + dgemv_("No transpose", &i__1, &k, &c_b7, &f[k + 1 + f_dim1], ldf, + &a[i__ + a_dim1], lda, &c_b8, &a[i__ + (k + 1) * a_dim1], + lda); + } + + a[i__ + k * a_dim1] = aik; + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */ +/* when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */ + + if (k < minmnfact) { + + i__1 = *n; + for (j = k + 1; j <= i__1; ++j) { + if (vn1[j] != 0.) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + + temp = (d__1 = a[i__ + j * a_dim1], abs(d__1)) / vn1[j]; +/* Computing MAX */ + d__1 = 0., d__2 = (temp + 1.) * (1. - temp); + temp = f2cmax(d__1,d__2); +/* Computing 2nd power */ + d__1 = vn1[j] / vn2[j]; + temp2 = temp * (d__1 * d__1); + if (temp2 <= tol3z) { + +/* At J-index, we have a difficult column for the */ +/* update of the 2-norm. Save the index of the previous */ +/* difficult column in IWORK(J-1). */ +/* NOTE: ILSTCC > 1, threfore we can use IWORK only */ +/* with N-1 elements, where the elements are */ +/* shifted by 1 to the left. */ + + iwork[j - 1] = lsticc; + +/* Set the index of the last difficult column LSTICC. */ + + lsticc = j; + + } else { + vn1[j] *= sqrt(temp); + } + } + } + + } + +/* End of while loop. */ + + } + +/* Now, afler the loop: */ +/* Set KB, the number of factorized columns in the block; */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig, IF = IOFFSET + KB. */ + + *kb = k; + if__ = i__; + +/* Apply the block reflector to the residual of the matrix A */ +/* and the residual of the right hand sides B, if the residual */ +/* matrix and and/or the residual of the right hand sides */ +/* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */ +/* This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + dgemm_("No transpose", "Transpose", &i__1, &i__2, kb, &c_b7, &a[if__ + + 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b8, &a[if__ + + 1 + (*kb + 1) * a_dim1], lda); + } + +/* Recompute the 2-norm of the difficult columns. */ +/* Loop over the index of the difficult columns from the largest */ +/* to the smallest index. */ + + while(lsticc > 0) { + +/* LSTICC is the index of the last difficult column is greater */ +/* than 1. */ +/* ITEMP is the index of the previous difficult column. 
*/
+
+	itemp = iwork[lsticc - 1];
+
+/*        Compute the 2-norm explicitly for the last difficult column and */
+/*        save it in the partial and exact 2-norm vectors VN1 and VN2. */
+
+/*        NOTE: The computation of VN1( LSTICC ) relies on the fact that */
+/*        DNRM2 does not fail on vectors with norm below the value of */
+/*        SQRT(DLAMCH('S')) */
+
+	i__1 = *m - if__;
+	vn1[lsticc] = dnrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1);
+	vn2[lsticc] = vn1[lsticc];
+
+/*        Downdate the index of the last difficult column to */
+/*        the index of the previous difficult column. */
+
+	lsticc = itemp;
+
+    }
+
+    return 0;
+
+/*     End of DLAQP3RK */
+
+} /* dlaqp3rk_ */
+
diff --git a/lapack-netlib/SRC/dlaqp3rk.f b/lapack-netlib/SRC/dlaqp3rk.f
new file mode 100644
index 000000000..8139345ed
--- /dev/null
+++ b/lapack-netlib/SRC/dlaqp3rk.f
@@ -0,0 +1,935 @@
+*> \brief \b DLAQP3RK computes a step of truncated QR factorization with column pivoting of a real m-by-n matrix A using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DLAQP3RK + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE DLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+*     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+*     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+*      IMPLICIT NONE
+*      LOGICAL            DONE
+*      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+*     $                   NB, NRHS
+*      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*
+*     .. Scalar Arguments ..
+*      LOGICAL            DONE
+*      INTEGER            KB, LDA, LDF, M, N, NB, NRHS, IOFFSET
+*      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+*      INTEGER            IWORK( * ), JPIV( * )
+*      DOUBLE PRECISION   A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ),
+*     $                   VN1( * ), VN2( * )
+*     ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLAQP3RK computes a step of truncated QR factorization with column
+*> pivoting of a real M-by-N matrix A block A(IOFFSET+1:M,1:N)
+*> by using Level 3 BLAS as
+*>
+*>                A * P(KB) = Q(KB) * R(KB).
+*>
+*> The routine tries to factorize NB columns from A starting from
+*> the row IOFFSET+1 and updates the residual matrix with BLAS 3
+*> xGEMM. The number of actually factorized columns is returned
+*> in KB and may be smaller than NB.
+*>
+*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides B matrix stored
+*> in A(IOFFSET+1:M,N+1:N+NRHS) with Q(KB)**T * B.
+*>
+*> Cases when the number of factorized columns KB < NB:
+*>
+*> (1) In some cases, due to catastrophic cancellations, it cannot
+*> factorize all NB columns and needs to update the residual matrix.
+*> Hence, the actual number of factorized columns in the block returned
+*> in KB is smaller than NB. The logical DONE is returned as FALSE.
+*> The factorization of the whole original matrix A_orig must proceed
+*> with the next block.
+*>
+*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB.
+*> +*> (3) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix is a zero matrix in some factorization +*> step KB, the factorization of the whole original matrix A_orig is +*> stopped, the logical DONE is returned as TRUE. The number of +*> factorized columns which is smaller than NB is returned in KB. +*> +*> (4) Whenever NaN is detected in the matrix A or in the array TAU, +*> the factorization of the whole original matrix A_orig is stopped, +*> the logical DONE is returned as TRUE. The number of factorized +*> columns which is smaller than NB is returned in KB. The INFO +*> parameter is set to the column index of the first NaN occurrence. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0 +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> Factorization block size, i.e the number of columns +*> to factorize in the matrix A. 0 <= NB +*> +*> If NB = 0, then the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine DGEQP3RK. 1 <= KP1 <= N_orig. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is DOUBLE PRECISION +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine DGEQP3RK. +*> MAXC2NRM >= 0. 
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:KB) below
+*>             the diagonal together with the array TAU represent
+*>             the orthogonal matrix Q(KB) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:KB) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,KB+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(KB)**T.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] DONE
+*> \verbatim
+*>          DONE is LOGICAL
+*>          TRUE: a) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to ABSTOL
+*>                   or RELTOL criterion,
+*>                b) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to the
+*>                   residual matrix being a ZERO matrix.
+*>                c) when NaN was detected in the matrix A
+*>                   or in the array TAU.
+*>          FALSE: otherwise.
+*> \endverbatim
+*>
+*> \param[out] KB
+*> \verbatim
+*>          KB is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= KB <= min(M-IOFFSET,NB,N).
+*>
+*>          KB also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is DOUBLE PRECISION
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank KB. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is DOUBLE PRECISION
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank KB) to the maximum column 2-norm of the
+*>          original matrix A_orig. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is DOUBLE PRECISION array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] AUXV
+*> \verbatim
+*>          AUXV is DOUBLE PRECISION array, dimension (NB)
+*>          Auxiliary vector.
+*> \endverbatim
+*>
+*> \param[out] F
+*> \verbatim
+*>          F is DOUBLE PRECISION array, dimension (LDF,NB)
+*>          Matrix F**T = L*(Y**T)*A.
+*> \endverbatim
+*>
+*> \param[in] LDF
+*> \verbatim
+*>          LDF is INTEGER
+*>          The leading dimension of the array F. 
LDF >= max(1,N+NRHS). +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step KB+1 ( when KB columns +*> have been factorized ). +*> +*> On exit: +*> KB is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(KB+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=KB+1, TAU(KB+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the actorization +*> step KB+1 ( when KB columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp3rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, + $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + LOGICAL DONE + INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, + $ NB, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. 
+ INTEGER IWORK( * ), JPIV( * ) + DOUBLE PRECISION A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ), + $ VN1( * ), VN2( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + INTEGER ITEMP, J, K, MINMNFACT, MINMNUPDT, + $ LSTICC, KP, I, IF + DOUBLE PRECISION AIK, HUGEVAL, TEMP, TEMP2, TOL3Z +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DGEMV, DLARFG, DSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX + DOUBLE PRECISION DLAMCH, DNRM2 + EXTERNAL DISNAN, DLAMCH, IDAMAX, DNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + NB = MIN( NB, MINMNFACT ) + TOL3Z = SQRT( DLAMCH( 'Epsilon' ) ) + HUGEVAL = DLAMCH( 'Overflow' ) +* +* Compute factorization in a while loop over NB columns, +* K is the column index in the block A(1:M,1:N). +* + K = 0 + LSTICC = 0 + DONE = .FALSE. +* + DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 ) + K = K + 1 + I = IOFFSET + K +* + IF( I.EQ.1 ) THEN +* +* We are at the first column of the original whole matrix A_orig, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + KP = KP1 +* + ELSE +* +* Determine the pivot column in K-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( K-1 ) + IDAMAX( N-K+1, VN1( K ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,K:N) in step K. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains NaN, set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( DISNAN( MAXC2NRMK ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = KB + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. + + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. 
+* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* Quick return, if the submatrix A(I:M,K:N) is +* a zero matrix. We need to check it only if the column index +* (same as row index) is larger than 1, since the condition +* for the whole original matrix A_orig is checked in the main +* routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + RELMAXC2NRMK = ZERO +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix is zero and we stop the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. +* + DO J = K, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + K - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third tolerance stopping criteria. +* NOTE: There is no need to test for ABSTOL.GE.ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig; +* + KB = K - 1 + IF = I - 1 +* +* Apply the block reflector to the residual of the +* matrix A and the residual of the right hand sides B, if +* the residual matrix and and/or the residual of the right +* hand sides exist, i.e. 
if the submatrix +* A(I+1:M,KB+1:N+NRHS) exists. This occurs when +* KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, N+NRHS-KB, KB,-ONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. +* + DO J = K, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,K:N): +* 1) swap the K-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) +* 3) copy the K-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. (Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than K in the next loop step.) +* 4) Save the pivot interchange with the indices relative to the +* the original matrix A_orig, not the block A(1:M,1:N). +* + IF( KP.NE.K ) THEN + CALL DSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 ) + CALL DSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF ) + VN1( KP ) = VN1( K ) + VN2( KP ) = VN2( K ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( K ) + JPIV( K ) = ITEMP + END IF +* +* Apply previous Householder reflectors to column K: +* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. +* + IF( K.GT.1 ) THEN + CALL DGEMV( 'No transpose', M-I+1, K-1, -ONE, A( I, 1 ), + $ LDA, F( K, 1 ), LDF, ONE, A( I, K ), 1 ) + END IF +* +* Generate elementary reflector H(k) using the column A(I:M,K). +* + IF( I.LT.M ) THEN + CALL DLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) ) + ELSE + TAU( K ) = ZERO + END IF +* +* Check if TAU(K) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(K) for Inf, +* since DLARFG cannot produce TAU(K) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by DLARFG can contain Inf, which requires +* TAU(K) to contain NaN. Therefore, this case of generating Inf +* by DLARFG is covered by checking TAU(K) for NaN. +* + IF( DISNAN( TAU(K) ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = K +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAU( K ) + RELMAXC2NRMK = TAU( K ) +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. 
This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* =============================================================== +* + AIK = A( I, K ) + A( I, K ) = ONE +* +* =============================================================== +* +* Compute the current K-th column of F: +* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K). +* + IF( K.LT.N+NRHS ) THEN + CALL DGEMV( 'Transpose', M-I+1, N+NRHS-K, + $ TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1, + $ ZERO, F( K+1, K ), 1 ) + END IF +* +* 2) Zero out elements above and on the diagonal of the +* column K in matrix F, i.e elements F(1:K,K). +* + DO J = 1, K + F( J, K ) = ZERO + END DO +* +* 3) Incremental updating of the K-th column of F: +* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T +* * A(I:M,K). +* + IF( K.GT.1 ) THEN + CALL DGEMV( 'Transpose', M-I+1, K-1, -TAU( K ), + $ A( I, 1 ), LDA, A( I, K ), 1, ZERO, + $ AUXV( 1 ), 1 ) +* + CALL DGEMV( 'No transpose', N+NRHS, K-1, ONE, + $ F( 1, 1 ), LDF, AUXV( 1 ), 1, ONE, + $ F( 1, K ), 1 ) + END IF +* +* =============================================================== +* +* Update the current I-th row of A: +* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) +* - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. +* + IF( K.LT.N+NRHS ) THEN + CALL DGEMV( 'No transpose', N+NRHS-K, K, -ONE, + $ F( K+1, 1 ), LDF, A( I, 1 ), LDA, ONE, + $ A( I, K+1 ), LDA ) + END IF +* + A( I, K ) = AIK +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. +* when K < MINMNFACT = min( M-IOFFSET, N ). +* + IF( K.LT.MINMNFACT ) THEN +* + DO J = K + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ABS( A( I, J ) ) / VN1( J ) + TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2.LE.TOL3Z ) THEN +* +* At J-index, we have a difficult column for the +* update of the 2-norm. Save the index of the previous +* difficult column in IWORK(J-1). +* NOTE: ILSTCC > 1, threfore we can use IWORK only +* with N-1 elements, where the elements are +* shifted by 1 to the left. +* + IWORK( J-1 ) = LSTICC +* +* Set the index of the last difficult column LSTICC. +* + LSTICC = J +* + ELSE + VN1( J ) = VN1( J )*SQRT( TEMP ) + END IF + END IF + END DO +* + END IF +* +* End of while loop. +* + END DO +* +* Now, afler the loop: +* Set KB, the number of factorized columns in the block; +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig, IF = IOFFSET + KB. +* + KB = K + IF = I +* +* Apply the block reflector to the residual of the matrix A +* and the residual of the right hand sides B, if the residual +* matrix and and/or the residual of the right hand sides +* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. 
+* This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, N+NRHS-KB, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* Recompute the 2-norm of the difficult columns. +* Loop over the index of the difficult columns from the largest +* to the smallest index. +* + DO WHILE( LSTICC.GT.0 ) +* +* LSTICC is the index of the last difficult column is greater +* than 1. +* ITEMP is the index of the previous difficult column. +* + ITEMP = IWORK( LSTICC-1 ) +* +* Compute the 2-norm explicilty for the last difficult column and +* save it in the partial and exact 2-norm vectors VN1 and VN2. +* +* NOTE: The computation of VN1( LSTICC ) relies on the fact that +* DNRM2 does not fail on vectors with norm below the value of +* SQRT(DLAMCH('S')) +* + VN1( LSTICC ) = DNRM2( M-IF, A( IF+1, LSTICC ), 1 ) + VN2( LSTICC ) = VN1( LSTICC ) +* +* Downdate the index of the last difficult column to +* the index of the previous difficult column. +* + LSTICC = ITEMP +* + END DO +* + RETURN +* +* End of DLAQP3RK +* + END diff --git a/lapack-netlib/SRC/dlaswlq.f b/lapack-netlib/SRC/dlaswlq.f index c95c94cbc..636c12dc8 100644 --- a/lapack-netlib/SRC/dlaswlq.f +++ b/lapack-netlib/SRC/dlaswlq.f @@ -99,19 +99,22 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) -*> +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= MB*M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -159,33 +162,37 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup laswlq +*> * ===================================================================== SUBROUTINE DLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), T( LDT, *) + DOUBLE PRECISION A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL DGELQT, DTPLQT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. 
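Both this DLASWLQ hunk and the DLATSQR one below normalize the LWORK = -1 workspace query so that WORK(1) returns the minimal LWORK. As a rough illustration of how a C caller would use that convention; this sketch assumes an LP64 build (32-bit `blasint`), a gfortran-style trailing-underscore binding, and the hypothetical helper name `query_dlaswlq_lwork`:

```c
/* Workspace query against DLASWLQ: with LWORK = -1 the routine only
   validates its arguments and stores the minimal LWORK in WORK(1).
   lda/ldt must still pass the argument checks
   (lda >= max(1,m), ldt >= mb). */
extern void dlaswlq_(const int *m, const int *n, const int *mb,
                     const int *nb, double *a, const int *lda,
                     double *t, const int *ldt, double *work,
                     const int *lwork, int *info);

int query_dlaswlq_lwork(int m, int n, int mb, int nb, int lda, int ldt)
{
    double wkopt = 0.0;
    int lwork = -1, info = 0;
    double dummy_a = 0.0, dummy_t = 0.0;   /* not referenced in a query */
    dlaswlq_(&m, &n, &mb, &nb, &dummy_a, &lda, &dummy_t, &ldt,
             &wkopt, &lwork, &info);
    return info == 0 ? (int)wkopt : -1;    /* WORK(1) = minimal LWORK */
}
```

The same pattern applies verbatim to DLATSQR, whose hunks below make the identical MINMN/LWMIN change with LWMIN = N*NB.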
@@ -196,12 +203,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LT.0 ) THEN INFO = -4 @@ -209,60 +223,62 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = MB*M +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN - CALL DGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) + IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN + CALL DGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO ) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL DGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL DGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M, (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL DTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), - $ LDA, T(1, CTR * M + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL DTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + $ LDA, T(1, CTR * M + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL DTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), - $ LDA, T(1, CTR * M + 1), LDT, - $ WORK, INFO ) - END IF + $ LDA, T(1, CTR * M + 1), LDT, + $ WORK, INFO ) + END IF +* + WORK( 1 ) = LWMIN * - WORK( 1 ) = M * MB RETURN * * End of DLASWLQ diff --git a/lapack-netlib/SRC/dlatrs3.f b/lapack-netlib/SRC/dlatrs3.f index e6d78b672..d18675b2d 100644 --- a/lapack-netlib/SRC/dlatrs3.f +++ b/lapack-netlib/SRC/dlatrs3.f @@ -151,13 +151,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK). +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal size of *> WORK. *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -165,6 +169,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -181,7 +186,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
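The LWMIN logic this DLATRS3 hunk introduces is plain integer arithmetic over the block counts. Restated as a hypothetical C helper (NB, normally chosen internally via ILAENV, is taken as a parameter here, and NBRHS = 32 matches the MIN(NRHS, 32) term in the documentation):

```c
/* Minimal workspace of DLATRS3 as computed above:
   LSCALE = NBA * MAX(NBA, MIN(NRHS, NBRHS)) scale factors, plus
   LANRM = NBA * NBA block norm bounds; 1 when MIN(N, NRHS) = 0. */
static int dlatrs3_lwmin(int n, int nrhs, int nb)
{
    if (n == 0 || nrhs == 0)
        return 1;
    int nba = (n + nb - 1) / nb;            /* number of block rows */
    int k = nrhs < 32 ? nrhs : 32;          /* NBRHS = 32 */
    int lscale = nba * (nba > k ? nba : k);
    int lanrm = nba * nba;
    return lscale + lanrm;
}
```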
* -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -253,7 +258,7 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. @@ -292,15 +297,24 @@ * row. WORK( I+KK*LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = LWMIN * * Test the input parameters * @@ -322,7 +336,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN @@ -649,6 +663,9 @@ END IF END DO END DO +* + WORK( 1 ) = LWMIN +* RETURN * * End of DLATRS3 diff --git a/lapack-netlib/SRC/dlatsqr.f b/lapack-netlib/SRC/dlatsqr.f index 94a04be02..0000aab68 100644 --- a/lapack-netlib/SRC/dlatsqr.f +++ b/lapack-netlib/SRC/dlatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -161,27 +164,29 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE DLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), T(LDT, *) + DOUBLE PRECISION A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. 
LOGICAL LSAME @@ -198,6 +203,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -205,65 +217,67 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = NB*N +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF +* IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL DGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF + IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL DGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF * - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL DGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CALL DGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) * - CTR = 1 - DO I = MB+1, II-MB+N , (MB-N) + CTR = 1 + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL DTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, - $ T(1, CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL DTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + $ T(1, CTR * N + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL DTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, - $ T(1, CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + IF( II.LE.M ) THEN + CALL DTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + $ T(1, CTR * N + 1), LDT, + $ WORK, INFO ) + END IF * - WORK( 1 ) = N*NB + WORK( 1 ) = LWMIN RETURN * * End of DLATSQR diff --git a/lapack-netlib/SRC/dsyev_2stage.f b/lapack-netlib/SRC/dsyev_2stage.f index 50d51d992..286366bfe 100644 --- a/lapack-netlib/SRC/dsyev_2stage.f +++ b/lapack-netlib/SRC/dsyev_2stage.f @@ -20,7 +20,7 @@ * Definition: * =========== * -* SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, +* SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, * INFO ) * * IMPLICIT NONE @@ -97,7 +97,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension LWORK +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -105,12 +105,12 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. 
*> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 2*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 2*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -143,7 +143,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heev_2stage * *> \par Further Details: * ===================== @@ -161,7 +161,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -169,16 +169,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * * ===================================================================== - SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, + SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, $ INFO ) * IMPLICIT NONE @@ -305,7 +305,7 @@ LLWORK = LWORK - INDWRK + 1 * CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ), - $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, + $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, $ WORK( INDWRK ), LLWORK, IINFO ) * * For eigenvalues only, call DSTERF. For eigenvectors, first call diff --git a/lapack-netlib/SRC/dsyevd.f b/lapack-netlib/SRC/dsyevd.f index b27f4cdc7..adcfcb373 100644 --- a/lapack-netlib/SRC/dsyevd.f +++ b/lapack-netlib/SRC/dsyevd.f @@ -96,8 +96,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, -*> dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -160,7 +159,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevd * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/dsyevr.f b/lapack-netlib/SRC/dsyevr.f index 698691533..8647b0162 100644 --- a/lapack-netlib/SRC/dsyevr.f +++ b/lapack-netlib/SRC/dsyevr.f @@ -271,7 +271,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,26*N). +*> The dimension of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 26*N. *> For optimal efficiency, LWORK >= (NB+6)*N, *> where NB is the max of the blocksize for DSYTRD and DORMTR *> returned by ILAENV. @@ -285,13 +286,14 @@ *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) -*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. *> \endverbatim *> *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). 
+*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the IWORK array, @@ -315,7 +317,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevr * *> \par Contributors: * ================== @@ -390,8 +392,13 @@ * LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) ) * - LWMIN = MAX( 1, 26*N ) - LIWMIN = MAX( 1, 10*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 26*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -450,7 +457,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 7 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = A( 1, 1 ) diff --git a/lapack-netlib/SRC/dsyevr_2stage.f b/lapack-netlib/SRC/dsyevr_2stage.f index 09242bbd3..63d5e3159 100644 --- a/lapack-netlib/SRC/dsyevr_2stage.f +++ b/lapack-netlib/SRC/dsyevr_2stage.f @@ -263,7 +263,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of DSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the orthogonal transformations applied by DORMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -277,12 +277,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 5*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 5*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -300,13 +301,14 @@ *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) -*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. *> \endverbatim *> *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the IWORK array, @@ -330,7 +332,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -358,7 +360,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -366,11 +368,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. 
-*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -444,8 +446,14 @@ IB = ILAENV2STAGE( 2, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -484,7 +492,7 @@ * NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) * NB = MAX( NB, ILAENV( 1, 'DORMTR', UPLO, N, -1, -1, -1 ) ) * LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN END IF * @@ -504,7 +512,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 7 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = A( 1, 1 ) @@ -608,7 +616,7 @@ * Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form. * * - CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), + CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), $ WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ), $ LHTRD, WORK( INDWK ), LLWORK, IINFO ) * @@ -727,7 +735,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/dsyevx.f b/lapack-netlib/SRC/dsyevx.f index 99719874b..fd6a78e32 100644 --- a/lapack-netlib/SRC/dsyevx.f +++ b/lapack-netlib/SRC/dsyevx.f @@ -244,7 +244,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevx * * ===================================================================== SUBROUTINE DSYEVX( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, IL, IU, @@ -338,14 +338,14 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWKMIN = 1 - WORK( 1 ) = LWKMIN + LWKOPT = 1 ELSE LWKMIN = 8*N NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'DORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( LWKMIN, ( NB + 3 )*N ) - WORK( 1 ) = LWKOPT END IF + WORK( 1 ) = LWKOPT * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -17 diff --git a/lapack-netlib/SRC/dsysv_aa.f b/lapack-netlib/SRC/dsysv_aa.f index 8dab5a384..0a96ecd7e 100644 --- a/lapack-netlib/SRC/dsysv_aa.f +++ b/lapack-netlib/SRC/dsysv_aa.f @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYsolve +*> \ingroup hesv_aa * * ===================================================================== SUBROUTINE DSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -177,7 +177,7 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS + INTEGER LWKMIN, LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS * .. * .. External Functions .. LOGICAL LSAME @@ -196,6 +196,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -206,17 +207,17 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_SYTRF = INT( WORK(1) ) + LWKOPT_SYTRF = INT( WORK( 1 ) ) CALL DSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_SYTRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) + LWKOPT_SYTRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_SYTRF, LWKOPT_SYTRS ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/dsysv_aa_2stage.f b/lapack-netlib/SRC/dsysv_aa_2stage.f index 72fbe1e9a..90dd0a38a 100644 --- a/lapack-netlib/SRC/dsysv_aa_2stage.f +++ b/lapack-netlib/SRC/dsysv_aa_2stage.f @@ -101,14 +101,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is DOUBLE PRECISION array, dimension (LTB) +*> TB is DOUBLE PRECISION array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -148,14 +148,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION workspace of size LWORK +*> WORK is DOUBLE PRECISION workspace of size (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -179,7 +180,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYsolve +*> \ingroup hesv_aa_2stage * * ===================================================================== SUBROUTINE DSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, @@ -205,7 +206,7 @@ * * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKMIN, LWKOPT * .. * .. External Functions .. LOGICAL LSAME @@ -226,6 +227,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -234,18 +236,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN @@ -255,7 +258,6 @@ RETURN END IF * -* * Compute the factorization A = U**T*T*U or A = L*T*L**T. * CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, diff --git a/lapack-netlib/SRC/dsysvx.f b/lapack-netlib/SRC/dsysvx.f index a30831e72..b2b8210ca 100644 --- a/lapack-netlib/SRC/dsysvx.f +++ b/lapack-netlib/SRC/dsysvx.f @@ -275,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
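*
*> \par Example (combined LTB and LWORK query in DSYSV_AA_2STAGE):
*> \verbatim
*> The two-stage Aasen solver documented above accepts simultaneous
*> queries of the band-storage size LTB and the workspace size LWORK,
*> as its own internal call to DSYTRF_AA_2STAGE illustrates. A sketch,
*> where TBQ and WKQ are hypothetical length-1 query buffers:
*>
*>    CALL DSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TBQ, -1,
*>   $                      IPIV, IPIV2, B, LDB, WKQ, -1, INFO )
*>    LTB   = INT( TBQ( 1 ) )
*>    LWORK = INT( WKQ( 1 ) )
*> \endverbatim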
* -*> \ingroup doubleSYsolve +*> \ingroup hesvx * * ===================================================================== SUBROUTINE DSYSVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B, @@ -305,7 +305,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKMIN, LWKOPT, NB DOUBLE PRECISION ANORM * .. * .. External Functions .. @@ -327,6 +327,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 3*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -344,12 +345,12 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 3*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 3*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'DSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) diff --git a/lapack-netlib/SRC/dsytrd.f b/lapack-netlib/SRC/dsytrd.f index 3dcfc3db2..58d4b633b 100644 --- a/lapack-netlib/SRC/dsytrd.f +++ b/lapack-netlib/SRC/dsytrd.f @@ -139,7 +139,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrd * *> \par Further Details: * ===================== @@ -247,7 +247,7 @@ * Determine the block size. * NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/dsytrd_2stage.f b/lapack-netlib/SRC/dsytrd_2stage.f index 8ae77d3e4..a88ac1c73 100644 --- a/lapack-netlib/SRC/dsytrd_2stage.f +++ b/lapack-netlib/SRC/dsytrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download DSYTRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download DSYTRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * DOUBLE PRECISION A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. -* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. 
*> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the orthogonal *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is DOUBLE PRECISION array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is DOUBLE PRECISION array, dimension (LHOUS2) +*> HOUS2 is DOUBLE PRECISION array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -143,23 +145,26 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +182,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +207,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +215,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. 
-*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -265,10 +270,13 @@ * KD = ILAENV2STAGE( 1, 'DSYTRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'DSYTRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'DSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -309,14 +317,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL DSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL DSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRD_SY2SB', -INFO ) RETURN END IF - CALL DSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL DSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRD_SB2ST', -INFO ) @@ -324,8 +332,7 @@ END IF * * - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN RETURN * * End of DSYTRD_2STAGE diff --git a/lapack-netlib/SRC/dsytrd_sb2st.F b/lapack-netlib/SRC/dsytrd_sb2st.F index bb74dd491..04d03d587 100644 --- a/lapack-netlib/SRC/dsytrd_sb2st.F +++ b/lapack-netlib/SRC/dsytrd_sb2st.F @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, +* SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, * D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * #if defined(_OPENMP) @@ -53,12 +53,12 @@ *> \param[in] STAGE1 *> \verbatim *> STAGE1 is CHARACTER*1 -*> = 'N': "No": to mention that the stage 1 of the reduction +*> = 'N': "No": to mention that the stage 1 of the reduction *> from dense to band using the dsytrd_sy2sb routine -*> was not called before this routine to reproduce AB. -*> In other term this routine is called as standalone. -*> = 'Y': "Yes": to mention that the stage 1 of the -*> reduction from dense to band using the dsytrd_sy2sb +*> was not called before this routine to reproduce AB. +*> In other term this routine is called as standalone. +*> = 'Y': "Yes": to mention that the stage 1 of the +*> reduction from dense to band using the dsytrd_sy2sb *> routine has been called to produce AB (e.g., AB is *> the output of dsytrd_sy2sb. 
*> \endverbatim @@ -66,10 +66,10 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> and thus LHOUS is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate or to apply Q later on, +*> = 'V': the Householder representation is needed to +*> either generate or to apply Q later on, *> then LHOUS is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -132,34 +132,39 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is DOUBLE PRECISION array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is DOUBLE PRECISION array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error *> message related to LHOUS is issued by XERBLA. *> LHOUS = MAX(1, dimension) where *> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> not available now if VECT='H' *> \endverbatim *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension LWORK. +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -188,7 +193,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup real16OTHERcomputational +*> \ingroup hetrd_hb2st * *> \par Further Details: * ===================== @@ -208,7 +213,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -216,16 +221,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. 
-*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * #if defined(_OPENMP) @@ -258,11 +263,11 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, WANTQ, UPPER, AFTERS1 - INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, + INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, - $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, + $ NBTILES, TTYPE, TID, NTHREADS, + $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SIDEV, SIZETAU, LDV, LHMIN, LWMIN * .. @@ -274,7 +279,7 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV2STAGE + INTEGER ILAENV2STAGE EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. @@ -282,7 +287,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -291,9 +295,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -355,7 +364,7 @@ ABDPOS = KD + 1 ABOFDPOS = KD ELSE - APOS = INDA + APOS = INDA AWPOS = INDA + KD + 1 DPOS = APOS OFDPOS = DPOS + 1 @@ -363,11 +372,11 @@ ABOFDPOS = 2 ENDIF -* -* Case KD=0: -* The matrix is diagonal. We just copy it (convert to "real" for -* real because D is double and the imaginary part should be 0) -* and store it in D. A sequential code here is better or +* +* Case KD=0: +* The matrix is diagonal. We just copy it (convert to "real" for +* real because D is double and the imaginary part should be 0) +* and store it in D. A sequential code here is better or * in a parallel environment it might need two cores for D and E * IF( KD.EQ.0 ) THEN @@ -382,17 +391,17 @@ WORK( 1 ) = 1 RETURN END IF -* -* Case KD=1: -* The matrix is already Tridiagonal. We have to make diagonal +* +* Case KD=1: +* The matrix is already Tridiagonal. We have to make diagonal * and offdiagonal elements real, and store them in D and E. -* For that, for real precision just copy the diag and offdiag -* to D and E while for the COMPLEX case the bulge chasing is -* performed to convert the hermetian tridiagonal to symmetric -* tridiagonal. A simpler conversion formula might be used, but then +* For that, for real precision just copy the diag and offdiag +* to D and E while for the COMPLEX case the bulge chasing is +* performed to convert the hermetian tridiagonal to symmetric +* tridiagonal. A simpler conversion formula might be used, but then * updating the Q matrix will be required and based if Q is generated -* or not this might complicate the story. -* +* or not this might complicate the story. 
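* (In this real-precision routine the plain copy below suffices; the
* bulge chasing mentioned above is needed only in the complex,
* Hermitian, case.)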
+* IF( KD.EQ.1 ) THEN DO 50 I = 1, N D( I ) = ( AB( ABDPOS, I ) ) @@ -413,7 +422,7 @@ RETURN END IF * -* Main code start here. +* Main code start here. * Reduce the symmetric band of A to a tridiagonal matrix. * THGRSIZ = N @@ -422,7 +431,7 @@ NBTILES = CEILING( REAL(N)/REAL(KD) ) STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) -* +* CALL DLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) CALL DLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) * @@ -431,7 +440,7 @@ * #if defined(_OPENMP) !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) -!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) !$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) !$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) !$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) @@ -440,7 +449,7 @@ #endif * * main bulge chasing loop -* +* DO 100 THGRID = 1, THGRNB STT = (THGRID-1)*THGRSIZ+1 THED = MIN( (STT + THGRSIZ -1), (N-1)) @@ -451,7 +460,7 @@ ST = STT DO 130 SWEEPID = ST, ED DO 140 K = 1, GRSIZ - MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) $ + (M-1)*GRSIZ + K IF ( MYID.EQ.1 ) THEN TTYPE = 1 @@ -477,16 +486,16 @@ ENDIF * * Call the kernel -* +* #if defined(_OPENMP) && _OPENMP >= 201307 - IF( TTYPE.NE.1 ) THEN + IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK @@ -494,20 +503,20 @@ !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK ENDIF #else - CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW ) ) -#endif +#endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 EXIT @@ -522,14 +531,14 @@ !$OMP END MASTER !$OMP END PARALLEL #endif -* +* * Copy the diagonal from A to D. Note that D is REAL thus only * the Real part is needed, the imaginary part should be zero. * DO 150 I = 1, N D( I ) = ( WORK( DPOS+(I-1)*LDA ) ) 150 CONTINUE -* +* * Copy the off diagonal from A to E. Note that E is REAL thus only * the Real part is needed, the imaginary part should be zero. * @@ -543,11 +552,10 @@ 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN WORK( 1 ) = LWMIN RETURN * * End of DSYTRD_SB2ST * END - + diff --git a/lapack-netlib/SRC/dsytrd_sy2sb.f b/lapack-netlib/SRC/dsytrd_sy2sb.f index 1660b5c7e..38acc71f1 100644 --- a/lapack-netlib/SRC/dsytrd_sy2sb.f +++ b/lapack-netlib/SRC/dsytrd_sy2sb.f @@ -123,8 +123,8 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. 
*> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY) +*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -158,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrd_he2hb * *> \par Further Details: * ===================== @@ -293,8 +295,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', ' ', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/dsytrf.f b/lapack-netlib/SRC/dsytrf.f index aee9b3f6a..2a1a2d4dc 100644 --- a/lapack-netlib/SRC/dsytrf.f +++ b/lapack-netlib/SRC/dsytrf.f @@ -107,7 +107,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -135,7 +135,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf * *> \par Further Details: * ===================== @@ -352,6 +352,7 @@ END IF * 40 CONTINUE +* WORK( 1 ) = LWKOPT RETURN * diff --git a/lapack-netlib/SRC/dsytrf_aa.f b/lapack-netlib/SRC/dsytrf_aa.f index 9a0b26ce5..924d4c165 100644 --- a/lapack-netlib/SRC/dsytrf_aa.f +++ b/lapack-netlib/SRC/dsytrf_aa.f @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= MAX(1,2*N). For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -125,10 +127,10 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -152,7 +154,7 @@ * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB DOUBLE PRECISION ALPHA * .. @@ -179,18 +181,25 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N WORK( 1 ) = LWKOPT END IF * @@ -203,11 +212,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/dsytrf_aa_2stage.f b/lapack-netlib/SRC/dsytrf_aa_2stage.f index c65bd86e6..fae95bab2 100644 --- a/lapack-netlib/SRC/dsytrf_aa_2stage.f +++ b/lapack-netlib/SRC/dsytrf_aa_2stage.f @@ -87,14 +87,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is DOUBLE PRECISION array, dimension (LTB) +*> TB is DOUBLE PRECISION array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION workspace of size LWORK +*> WORK is DOUBLE PRECISION workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used +*> to select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -211,9 +211,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -227,10 +227,10 @@ NB = ILAENV( 1, 'DSYTRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = MAX( 1, (3*NB+1)*N ) END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = MAX( 1, N*NB ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -239,7 +239,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * diff --git a/lapack-netlib/SRC/dsytrf_rk.f b/lapack-netlib/SRC/dsytrf_rk.f index 086586968..0717eb076 100644 --- a/lapack-netlib/SRC/dsytrf_rk.f +++ b/lapack-netlib/SRC/dsytrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension ( MAX(1,LWORK) ). +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -229,7 +229,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
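*
*> \par Example (sizing WORK via ILAENV):
*> \verbatim
*> A hypothetical sketch of the "LWORK >= N*NB" recommendation quoted
*> above, using an explicit ILAENV call rather than an LWORK = -1
*> query:
*>
*>    NB    = ILAENV( 1, 'DSYTRF_RK', UPLO, N, -1, -1, -1 )
*>    LWORK = MAX( 1, N*NB )
*> \endverbatim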
* -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_rk * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/dsytrf_rook.f b/lapack-netlib/SRC/dsytrf_rook.f index 2f00d1802..316663485 100644 --- a/lapack-netlib/SRC/dsytrf_rook.f +++ b/lapack-netlib/SRC/dsytrf_rook.f @@ -118,7 +118,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -146,7 +146,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_rook * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/dsytri2.f b/lapack-netlib/SRC/dsytri2.f index dbcdcdb58..5960d3992 100644 --- a/lapack-netlib/SRC/dsytri2.f +++ b/lapack-netlib/SRC/dsytri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (N+NB+1)*(NB+3) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE DSYTRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -159,9 +159,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'DSYTRI2', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +177,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = MINSIZE RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - IF( NBMAX .GE. N ) THEN + IF( NBMAX.GE.N ) THEN CALL DSYTRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL DSYTRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of DSYTRI2 diff --git a/lapack-netlib/SRC/dsytri_3.f b/lapack-netlib/SRC/dsytri_3.f index 86d69cdfd..50834c605 100644 --- a/lapack-netlib/SRC/dsytri_3.f +++ b/lapack-netlib/SRC/dsytri_3.f @@ -119,16 +119,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (N+NB+1)*(NB+3). +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= (N+NB+1)*(NB+3). +*> The length of WORK. +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). 
*> -*> If LDWORK = -1, then a workspace query is assumed; +*> If LWORK = -1, then a workspace query is assumed; *> the routine only calculates the optimal size of the optimal *> size of the WORK array, returns this value as the first *> entry of the WORK array, and no error message related to @@ -152,7 +153,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetri_3 * *> \par Contributors: * ================== @@ -208,8 +209,13 @@ * * Determine the block size * - NB = MAX( 1, ILAENV( 1, 'DSYTRI_3', UPLO, N, -1, -1, -1 ) ) - LWKOPT = ( N+NB+1 ) * ( NB+3 ) + IF( N.EQ.0 ) THEN + LWKOPT = 1 + ELSE + NB = MAX( 1, ILAENV( 1, 'DSYTRI_3', UPLO, N, -1, -1, -1 ) ) + LWKOPT = ( N+NB+1 ) * ( NB+3 ) + END IF + WORK( 1 ) = LWKOPT * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 @@ -217,7 +223,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN INFO = -8 END IF * @@ -225,7 +231,6 @@ CALL XERBLA( 'DSYTRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWKOPT RETURN END IF * diff --git a/lapack-netlib/SRC/dsytrs_aa.f b/lapack-netlib/SRC/dsytrs_aa.f index 26b11a2a0..f0016cb7f 100644 --- a/lapack-netlib/SRC/dsytrs_aa.f +++ b/lapack-netlib/SRC/dsytrs_aa.f @@ -105,7 +105,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -123,7 +129,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrs_aa * * ===================================================================== SUBROUTINE DSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, @@ -151,7 +157,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME @@ -161,13 +167,19 @@ EXTERNAL DLACPY, DGTSV, DSWAP, DTRSM, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -178,21 +190,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKMIN RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. 
NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/ilaenv.c b/lapack-netlib/SRC/ilaenv.c index c47224a0c..8f3b2db8e 100644 --- a/lapack-netlib/SRC/ilaenv.c +++ b/lapack-netlib/SRC/ilaenv.c @@ -191,7 +191,7 @@ typedef struct Namelist Namelist; #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER #define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -252,11 +252,11 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; -#define mycycle() continue; -#define myceiling(w) {ceil(w)} -#define myhuge(w) {HUGE_VAL} +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} -#define mymaxloc(w,s,e,n) {dmaxloc_(w,*(s),*(e),n)} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) /* procedure parameter types for -A and -C++ */ @@ -509,12 +509,18 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + /* Table of constant values */ static integer c__1 = 1; -static real c_b174 = 0.f; -static real c_b175 = 1.f; +static real c_b179 = 0.f; +static real c_b180 = 1.f; static integer c__0 = 0; /* > \brief \b ILAENV */ @@ -599,9 +605,9 @@ f"> */ /* > = 9: maximum size of the subproblems at the bottom of the */ /* > computation tree in the divide-and-conquer algorithm */ /* > (used by xGELSD and xGESDD) */ -/* > =10: ieee NaN arithmetic can be trusted not to trap */ +/* > =10: ieee infinity and NaN arithmetic can be trusted not to trap */ /* > =11: infinity arithmetic can be trusted not to trap */ -/* > 12 <= ISPEC <= 16: */ +/* > 12 <= ISPEC <= 17: */ /* > xHSEQR or related subroutines, */ /* > see IPARMQ for detailed explanation */ /* > \endverbatim */ @@ -652,9 +658,7 @@ f"> */ /* > \author Univ. of Colorado Denver */ /* > \author NAG Ltd. */ -/* > \date November 2019 */ - -/* > \ingroup OTHERauxiliary */ +/* > \ingroup ilaenv */ /* > \par Further Details: */ /* ===================== */ @@ -685,7 +689,7 @@ integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1, opts_len) { /* System generated locals */ - integer ret_val; + integer ret_val, i__1, i__2, i__3; /* Local variables */ logical twostage; @@ -702,10 +706,9 @@ integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1, integer *, integer *); -/* -- LAPACK auxiliary routine (version 3.9.0) -- */ +/* -- LAPACK auxiliary routine -- */ /* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ /* -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- */ -/* November 2019 */ /* ===================================================================== */ @@ -728,6 +731,7 @@ integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1, case 14: goto L160; case 15: goto L160; case 16: goto L160; + case 17: goto L160; } /* Invalid value for ISPEC */ @@ -908,6 +912,12 @@ L50: } else { nb = 64; } + } else if (s_cmp(subnam + 3, "QP3RK", (ftnlen)4, (ftnlen)5) == 0) { + if (sname) { + nb = 32; + } else { + nb = 32; + } } } else if (s_cmp(c2, "PO", (ftnlen)2, (ftnlen)2) == 0) { if (s_cmp(c3, "TRF", (ftnlen)3, (ftnlen)3) == 0) { @@ -1034,6 +1044,21 @@ L50: } else { nb = 64; } + } else if (s_cmp(c3, "SYL", (ftnlen)3, (ftnlen)3) == 0) { +/* The upper bound is to prevent overly aggressive scaling. */ + if (sname) { +/* Computing MIN */ +/* Computing MAX */ + i__2 = 48, i__3 = (f2cmin(*n1,*n2) << 4) / 100; + i__1 = f2cmax(i__2,i__3); + nb = f2cmin(i__1,240); + } else { +/* Computing MIN */ +/* Computing MAX */ + i__2 = 24, i__3 = (f2cmin(*n1,*n2) << 3) / 100; + i__1 = f2cmax(i__2,i__3); + nb = f2cmin(i__1,80); + } } } else if (s_cmp(c2, "LA", (ftnlen)2, (ftnlen)2) == 0) { if (s_cmp(c3, "UUM", (ftnlen)3, (ftnlen)3) == 0) { @@ -1042,6 +1067,12 @@ L50: } else { nb = 64; } + } else if (s_cmp(c3, "TRS", (ftnlen)3, (ftnlen)3) == 0) { + if (sname) { + nb = 32; + } else { + nb = 32; + } } } else if (sname && s_cmp(c2, "ST", (ftnlen)2, (ftnlen)2) == 0) { if (s_cmp(c3, "EBZ", (ftnlen)3, (ftnlen)3) == 0) { @@ -1093,6 +1124,12 @@ L60: } else { nbmin = 2; } + } else if (s_cmp(subnam + 3, "QP3RK", (ftnlen)4, (ftnlen)5) == 0) { + if (sname) { + nbmin = 2; + } else { + nbmin = 2; + } } } else if (s_cmp(c2, "SY", (ftnlen)2, (ftnlen)2) == 0) { if (s_cmp(c3, "TRF", (ftnlen)3, (ftnlen)3) == 0) { @@ -1184,6 +1221,12 @@ L70: } else { nx = 128; } + } else if (s_cmp(subnam + 3, "QP3RK", (ftnlen)4, (ftnlen)5) == 0) { + if (sname) { + nx = 128; + } else { + nx = 128; + } } } else if (s_cmp(c2, "SY", (ftnlen)2, (ftnlen)2) == 0) { if (sname && s_cmp(c3, "TRD", (ftnlen)3, (ftnlen)3) == 0) { @@ -1270,29 +1313,29 @@ L130: L140: -/* ISPEC = 10: ieee NaN arithmetic can be trusted not to trap */ +/* ISPEC = 10: ieee and infinity NaN arithmetic can be trusted not to trap */ /* ILAENV = 0 */ ret_val = 1; if (ret_val == 1) { - ret_val = ieeeck_(&c__1, &c_b174, &c_b175); + ret_val = ieeeck_(&c__1, &c_b179, &c_b180); } return ret_val; L150: -/* ISPEC = 11: infinity arithmetic can be trusted not to trap */ +/* ISPEC = 11: ieee infinity arithmetic can be trusted not to trap */ /* ILAENV = 0 */ ret_val = 1; if (ret_val == 1) { - ret_val = ieeeck_(&c__0, &c_b174, &c_b175); + ret_val = ieeeck_(&c__0, &c_b179, &c_b180); } return ret_val; L160: -/* 12 <= ISPEC <= 16: xHSEQR or related subroutines. */ +/* 12 <= ISPEC <= 17: xHSEQR or related subroutines. */ ret_val = iparmq_(ispec, name__, opts, n1, n2, n3, n4) ; diff --git a/lapack-netlib/SRC/ilaenv.f b/lapack-netlib/SRC/ilaenv.f index a639e0375..e74a2b35e 100644 --- a/lapack-netlib/SRC/ilaenv.f +++ b/lapack-netlib/SRC/ilaenv.f @@ -132,7 +132,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
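*
*> \par Example:
*> \verbatim
*> A typical block-size query (the form used by DGETRF for an M-by-N
*> problem; unused dimensions are passed as -1):
*>
*>    NB = ILAENV( 1, 'DGETRF', ' ', M, N, -1, -1 )
*> \endverbatim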
* -*> \ingroup OTHERauxiliary +*> \ingroup ilaenv * *> \par Further Details: * ===================== @@ -355,6 +355,12 @@ ELSE NB = 64 END IF + ELSE IF( SUBNAM( 4: 7 ).EQ.'QP3RK' ) THEN + IF( SNAME ) THEN + NB = 32 + ELSE + NB = 32 + END IF END IF ELSE IF( C2.EQ.'PO' ) THEN IF( C3.EQ.'TRF' ) THEN @@ -541,7 +547,14 @@ ELSE NBMIN = 2 END IF + ELSE IF( SUBNAM( 4: 7 ).EQ.'QP3RK' ) THEN + IF( SNAME ) THEN + NBMIN = 2 + ELSE + NBMIN = 2 + END IF END IF + ELSE IF( C2.EQ.'SY' ) THEN IF( C3.EQ.'TRF' ) THEN IF( SNAME ) THEN @@ -618,6 +631,12 @@ ELSE NX = 128 END IF + ELSE IF( SUBNAM( 4: 7 ).EQ.'QP3RK' ) THEN + IF( SNAME ) THEN + NX = 128 + ELSE + NX = 128 + END IF END IF ELSE IF( C2.EQ.'SY' ) THEN IF( SNAME .AND. C3.EQ.'TRD' ) THEN diff --git a/lapack-netlib/SRC/sbbcsd.f b/lapack-netlib/SRC/sbbcsd.f index d5720cb33..2a619cb71 100644 --- a/lapack-netlib/SRC/sbbcsd.f +++ b/lapack-netlib/SRC/sbbcsd.f @@ -805,7 +805,7 @@ CALL SLARTGP( B22BULGE, B22E(I-1), WORK(IU2SN+I-1), $ WORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL SLARTGS( B21E(I), B21E(I+1), NU, WORK(IU2CS+I-1), + CALL SLARTGS( B21E(I), B21D(I+1), NU, WORK(IU2CS+I-1), $ WORK(IU2SN+I-1) ) ELSE CALL SLARTGS( B22D(I), B22E(I), MU, WORK(IU2CS+I-1), diff --git a/lapack-netlib/SRC/sgebrd.f b/lapack-netlib/SRC/sgebrd.f index 2d0c6d651..b33ad0b1f 100644 --- a/lapack-netlib/SRC/sgebrd.f +++ b/lapack-netlib/SRC/sgebrd.f @@ -122,7 +122,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -223,8 +224,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. EXTERNAL SGEBD2, SGEMM, SLABRD, XERBLA @@ -242,9 +243,16 @@ * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + ENDIF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -252,7 +260,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -264,7 +272,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -283,7 +290,7 @@ * Determine when to switch from blocked to unblocked code. 
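* (NX is the crossover dimension: once the trailing submatrix is no
* larger than NX, the unblocked SGEBD2 path below is used instead.)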
* IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using @@ -342,7 +349,8 @@ * CALL SGEBD2( M-I+1, N-I+1, A( I, I ), LDA, D( I ), E( I ), $ TAUQ( I ), TAUP( I ), WORK, IINFO ) - WORK( 1 ) = SROUNDUP_LWORK(WS) +* + WORK( 1 ) = SROUNDUP_LWORK( WS ) RETURN * * End of SGEBRD diff --git a/lapack-netlib/SRC/sgedmd.f90 b/lapack-netlib/SRC/sgedmd.f90 index 49cb11527..4860e8898 100644 --- a/lapack-netlib/SRC/sgedmd.f90 +++ b/lapack-netlib/SRC/sgedmd.f90 @@ -1,423 +1,573 @@ - SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & - M, N, X, LDX, Y, LDY, NRNK, TOL, & - K, REIG, IMEIG, Z, LDZ, RES, & - B, LDB, W, LDW, S, LDS, & - WORK, LWORK, IWORK, LIWORK, INFO ) -! March 2023 +!> \brief \b SGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== +! +! SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, REIG, IMEIG, Z, LDZ, RES, & +! B, LDB, W, LDW, S, LDS, & +! WORK, LWORK, IWORK, LIWORK, INFO ) !..... - USE iso_fortran_env - IMPLICIT NONE - INTEGER, PARAMETER :: WP = real32 +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real32 !..... ! Scalar arguments - CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF - INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & - NRNK, LDZ, LDB, LDW, LDS, & - LWORK, LIWORK - INTEGER, INTENT(OUT) :: K, INFO - REAL(KIND=WP), INTENT(IN) :: TOL +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LWORK, LIWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL ! Array arguments - REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) - REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & - W(LDW,*), S(LDS,*) - REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & - RES(*) - REAL(KIND=WP), INTENT(OUT) :: WORK(*) - INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! SGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, SGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, SGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. +! REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! 
W(LDW,*), S(LDS,*) +! REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & +! RES(*) +! REAL(KIND=WP), INTENT(OUT) :: WORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) ! +!............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> SGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, SGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, SGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim !...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. 
+!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim !...................................................................... +!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Distribution Statement A: +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim +!============================================================ ! Arguments ! ========= -! JOBS (input) CHARACTER*1 -! Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim !..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 'N' :: The eigenvectors are not computed. +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. 
+!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim !..... -! JOBR (input) CHARACTER*1 -! Determines whether to compute the residuals. -! 'R' :: The residuals for the computed eigenpairs will be -! computed and stored in the array RES. -! See the description of RES. -! For this option to be legal, JOBZ must be 'V'. -! 'N' :: The residuals are not computed. +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. +!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim !..... -! JOBF (input) CHARACTER*1 -! Specifies whether to store information needed for post- -! processing (e.g. computing refined Ritz vectors) -! 'R' :: The matrix needed for the refinement of the Ritz -! vectors is computed and stored in the array B. -! See the description of B. -! 'E' :: The unscaled eigenvectors of the Exact DMD are -! computed and returned in the array B. See the -! description of B. -! 'N' :: No eigenvector refinement data is computed. +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim !..... -! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } -! Allows for a selection of the SVD algorithm from the -! LAPACK library. -! 1 :: SGESVD (the QR SVD algorithm) -! 2 :: SGESDD (the Divide and Conquer algorithm; if enough -! workspace available, this is the fastest option) -! 3 :: SGESVDQ (the preconditioned QR SVD ; this and 4 -! are the most accurate options) -! 4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 -! are the most accurate options) -! For the four methods above, a significant difference in -! the accuracy of small singular values is possible if -! the snapshots vary in norm so that X is severely -! ill-conditioned. If small (smaller than EPS*||X||) -! singular values are of interest and JOBS=='N', then -! the options (3, 4) give the most accurate results, where -! the option 4 is slightly better and with stronger -! theoretical background. -! If JOBS=='S', i.e. the columns of X will be normalized, -! then all methods give nearly equally accurate results. +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. 
+!> 1 :: SGESVD (the QR SVD algorithm) +!> 2 :: SGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: SGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim !..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim !..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim !..... -! X (input/output) REAL(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. +!> \param[in,out] X +!> \verbatim +!> X (input/output) REAL(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. +!> \endverbatim !..... -! LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. +!> \param[in] LDX +!> \verbatim +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim !..... -! Y (input/workspace/output) REAL(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) REAL(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim !..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. 
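[Editor's note] To make the X/Y conventions above concrete: the snapshot pair comes from a trajectory x_1, ..., x_{N+1} of a system x_{k+1} = A*x_k with A not directly accessible, taking X = [x_1 ... x_N] and Y = [x_2 ... x_{N+1}], so Y = A*X holds column by column. A hedged C sketch of the packing, assuming column-major storage with leading dimensions exactly as in the LDX/LDY arguments documented here (traj, ldt and build_snapshots are illustrative names, not LAPACK identifiers):

    #include <stddef.h>

    /* traj is m-by-(n+1), column-major with leading dimension ldt >= m;
       its k-th column holds the k-th trajectory state. */
    static void build_snapshots(int m, int n, const float *traj, int ldt,
                                float *x, int ldx, float *y, int ldy)
    {
        for (int j = 0; j < n; ++j)
            for (int i = 0; i < m; ++i) {
                x[i + (size_t)j * ldx] = traj[i + (size_t)j * ldt];        /* j-th state     */
                y[i + (size_t)j * ldy] = traj[i + (size_t)(j + 1) * ldt];  /* its successor  */
            }
    }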
+!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim !..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1) -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1) +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim !..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim !..... -! K (output) INTEGER, 0 <= K <= N -! The dimension of the POD basis for the data snapshot -! matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim !..... -! REIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of REIG contain -! the real parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! See the descriptions of K, IMEIG, and Z. +!> \param[out] REIG +!> \verbatim +!> REIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of REIG contain +!> the real parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> See the descriptions of K, IMEIG, and Z. +!> \endverbatim !..... -! 
IMEIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of IMEIG contain -! the imaginary parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! The eigenvalues are determined as follows: -! If IMEIG(i) == 0, then the corresponding eigenvalue is -! real, LAMBDA(i) = REIG(i). -! If IMEIG(i)>0, then the corresponding complex -! conjugate pair of eigenvalues reads -! LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) -! LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) -! That is, complex conjugate pairs have consecutive -! indices (i,i+1), with the positive imaginary part -! listed first. -! See the descriptions of K, REIG, and Z. +!> \param[out] IMEIG +!> \verbatim +!> IMEIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of IMEIG contain +!> the imaginary parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> The eigenvalues are determined as follows: +!> If IMEIG(i) == 0, then the corresponding eigenvalue is +!> real, LAMBDA(i) = REIG(i). +!> If IMEIG(i)>0, then the corresponding complex +!> conjugate pair of eigenvalues reads +!> LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) +!> LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) +!> That is, complex conjugate pairs have consecutive +!> indices (i,i+1), with the positive imaginary part +!> listed first. +!> See the descriptions of K, REIG, and Z. +!> \endverbatim !..... -! Z (workspace/output) REAL(KIND=WP) M-by-N array -! If JOBZ =='V' then -! Z contains real Ritz vectors as follows: -! If IMEIG(i)=0, then Z(:,i) is an eigenvector of -! the i-th Ritz value; ||Z(:,i)||_2=1. -! If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then -! [Z(:,i) Z(:,i+1)] span an invariant subspace and -! the Ritz values extracted from this subspace are -! REIG(i) + sqrt(-1)*IMEIG(i) and -! REIG(i) - sqrt(-1)*IMEIG(i). -! The corresponding eigenvectors are -! Z(:,i) + sqrt(-1)*Z(:,i+1) and -! Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. -! || Z(:,i:i+1)||_F = 1. -! If JOBZ == 'F', then the above descriptions hold for -! the columns of X(:,1:K)*W(1:K,1:K), where the columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. The columns of W(1:K,1:K) -! are similarly structured: If IMEIG(i) == 0 then -! X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 -! then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and -! X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) -! are the eigenvectors of LAMBDA(i), LAMBDA(i+1). -! See the descriptions of REIG, IMEIG, X and W. +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) REAL(KIND=WP) M-by-N array +!> If JOBZ =='V' then +!> Z contains real Ritz vectors as follows: +!> If IMEIG(i)=0, then Z(:,i) is an eigenvector of +!> the i-th Ritz value; ||Z(:,i)||_2=1. +!> If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then +!> [Z(:,i) Z(:,i+1)] span an invariant subspace and +!> the Ritz values extracted from this subspace are +!> REIG(i) + sqrt(-1)*IMEIG(i) and +!> REIG(i) - sqrt(-1)*IMEIG(i). +!> The corresponding eigenvectors are +!> Z(:,i) + sqrt(-1)*Z(:,i+1) and +!> Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. +!> || Z(:,i:i+1)||_F = 1. +!> If JOBZ == 'F', then the above descriptions hold for +!> the columns of X(:,1:K)*W(1:K,1:K), where the columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. 
The columns of W(1:K,1:K) +!> are similarly structured: If IMEIG(i) == 0 then +!> X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 +!> then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and +!> X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) +!> are the eigenvectors of LAMBDA(i), LAMBDA(i+1). +!> See the descriptions of REIG, IMEIG, X and W. +!> \endverbatim !..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim !..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs. -! If LAMBDA(i) is real, then -! RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. -! If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair -! then -! RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F -! where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] -! [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. -! It holds that -! RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 -! RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 -! where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) -! ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) -! See the description of REIG, IMEIG and Z. +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs. +!> If LAMBDA(i) is real, then +!> RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. +!> If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair +!> then +!> RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F +!> where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] +!> [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. +!> It holds that +!> RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 +!> RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 +!> where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) +!> ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) +!> See the description of REIG, IMEIG and Z. +!> \endverbatim !..... -! B (output) REAL(KIND=WP) M-by-N array. -! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1;K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. +!> \param[out] B +!> \verbatim +!> B (output) REAL(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1;K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim !..... -! LDB (input) INTEGER, LDB >= M -! The leading dimension of the array B. +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim !..... -! W (workspace/output) REAL(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient (real and -! imaginary parts for each complex conjugate pair of the -! eigenvalues). The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! 
W is also used as a workspace to temporarily store the -! left singular vectors of X. +!> \param[out] W +!> \verbatim +!> W (workspace/output) REAL(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient (real and +!> imaginary parts for each complex conjugate pair of the +!> eigenvalues). The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> left singular vectors of X. +!> \endverbatim !..... -! LDW (input) INTEGER, LDW >= N -! The leading dimension of the array W. +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim !..... -! S (workspace/output) REAL(KIND=WP) N-by-N array -! The array S(1:K,1:K) is used for the matrix Rayleigh -! quotient. This content is overwritten during -! the eigenvalue decomposition by SGEEV. -! See the description of K. +!> \param[out] S +!> \verbatim +!> S (workspace/output) REAL(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by SGEEV. +!> See the description of K. +!> \endverbatim !..... -! LDS (input) INTEGER, LDS >= N -! The leading dimension of the array S. +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim !..... -! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array -! On exit, WORK(1:N) contains the singular values of -! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). -! If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain -! scaling factor WORK(N+2)/WORK(N+1) used to scale X -! and Y to avoid overflow in the SVD of X. -! This may be of interest if the scaling option is off -! and as many as possible smallest eigenvalues are -! desired to the highest feasible accuracy. -! If the call to SGEDMD is only workspace query, then -! WORK(1) contains the minimal workspace length and -! WORK(2) is the optimal workspace length. Hence, the -! length of work is at least 2. -! See the description of LWORK. +!> \param[out] WORK +!> \verbatim +!> WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +!> On exit, WORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain +!> scaling factor WORK(N+2)/WORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. +!> If the call to SGEDMD is only workspace query, then +!> WORK(1) contains the minimal workspace length and +!> WORK(2) is the optimal workspace length. Hence, the +!> length of work is at least 2. +!> See the description of LWORK. +!> \endverbatim !..... -! LWORK (input) INTEGER -! The minimal length of the workspace vector WORK. -! LWORK is calculated as follows: -! If WHTSVD == 1 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). -! If JOBZ == 'N' then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). -! Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal -! workspace length of SGESVD. -! If WHTSVD == 2 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) -! If JOBZ == 'N', then -! 
LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the -! minimal workspace length of SGESDD. -! If WHTSVD == 3 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = N+M+MAX(3*N+1, -! MAX(1,3*N+M,5*N),MAX(1,N)) -! is the minimal workspace length of SGESVDQ. -! If WHTSVD == 4 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the -! minimal workspace length of SGEJSV. -! The above expressions are not simplified in order to -! make the usage of WORK more transparent, and for -! easier checking. In any case, LWORK >= 2. -! If on entry LWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LWORK +!> \verbatim +!> LWORK (input) INTEGER +!> The minimal length of the workspace vector WORK. +!> LWORK is calculated as follows: +!> If WHTSVD == 1 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). +!> If JOBZ == 'N' then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). +!> Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal +!> workspace length of SGESVD. +!> If WHTSVD == 2 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the +!> minimal workspace length of SGESDD. +!> If WHTSVD == 3 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = N+M+MAX(3*N+1, +!> MAX(1,3*N+M,5*N),MAX(1,N)) +!> is the minimal workspace length of SGESVDQ. +!> If WHTSVD == 4 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the +!> minimal workspace length of SGEJSV. +!> The above expressions are not simplified in order to +!> make the usage of WORK more transparent, and for +!> easier checking. In any case, LWORK >= 2. +!> If on entry LWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim !..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! 
If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! !............................................................. !............................................................. + SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, REIG, IMEIG, Z, LDZ, RES, & + B, LDB, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real32 +! +! Scalar arguments +! ~~~~~~~~~~~~~~~~ + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LWORK, LIWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! +! Array arguments +! ~~~~~~~~~~~~~~~ + REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +! ! Parameters ! 
~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP - +! ! Local scalars ! ~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -431,11 +581,11 @@ WNTEX, WNTREF, WNTRES, WNTVEC CHARACTER :: JOBZL, T_OR_N CHARACTER :: JSVOPT - +! ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2) - +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) SLANGE, SLAMCH, SNRM2 @@ -443,13 +593,13 @@ INTEGER ISAMAX LOGICAL SISNAN, LSAME EXTERNAL SISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~~~~ EXTERNAL SAXPY, SGEMM, SSCAL EXTERNAL SGEEV, SGEJSV, SGESDD, SGESVD, SGESVDQ, & SLACPY, SLASCL, SLASSQ, XERBLA - +! ! Intrinsic functions ! ~~~~~~~~~~~~~~~~~~~ INTRINSIC INT, FLOAT, MAX, SQRT @@ -632,7 +782,8 @@ K = 0 DO i = 1, N !WORK(i) = DNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL SLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 @@ -705,7 +856,8 @@ ! carefully computed using SLASSQ. DO i = 1, N !WORK(i) = DNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL SLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 diff --git a/lapack-netlib/SRC/sgehrd.f b/lapack-netlib/SRC/sgehrd.f index 47733d947..cfa17e156 100644 --- a/lapack-netlib/SRC/sgehrd.f +++ b/lapack-netlib/SRC/sgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -173,7 +173,7 @@ INTEGER IHI, ILO, INFO, LDA, LWORK, N * .. * .. Array Arguments .. - REAL A( LDA, * ), TAU( * ), WORK( * ) + REAL A( LDA, * ), TAU( * ), WORK( * ) * .. * * ===================================================================== @@ -182,7 +182,7 @@ INTEGER NBMAX, LDT, TSIZE PARAMETER ( NBMAX = 64, LDT = NBMAX+1, $ TSIZE = LDT*NBMAX ) - REAL ZERO, ONE + REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, $ ONE = 1.0E+0 ) * .. @@ -190,7 +190,7 @@ LOGICAL LQUERY INTEGER I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB, $ NBMIN, NH, NX - REAL EI + REAL EI * .. * .. External Subroutines .. 
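[Editor's note] The SGEHRD change starting above, like the SGEBRD/SGELQF/SGEQLF hunks nearby, tightens one convention: a workspace query must report a size of at least 1 even for empty problems (here, IHI-ILO+1 <= 1). A hedged C usage sketch of the two-call LWORK=-1 pattern these fixes serve, assuming the conventional underscore-suffixed Fortran binding with 32-bit integers; error handling kept minimal:

    #include <stdlib.h>

    extern void sgehrd_(const int *n, const int *ilo, const int *ihi,
                        float *a, const int *lda, float *tau,
                        float *work, const int *lwork, int *info);

    /* Reduce A (n-by-n, column-major) to upper Hessenberg form. */
    static int hessenberg(int n, float *a, int lda, float *tau)
    {
        int ilo = 1, ihi = n, info = 0, lwork = -1;
        float query = 0.0f;
        /* 1st call: LWORK = -1 is a workspace query; the size comes back in WORK(1). */
        sgehrd_(&n, &ilo, &ihi, a, &lda, tau, &query, &lwork, &info);
        if (info != 0) return info;
        lwork = (int)query;   /* >= 1 even when IHI-ILO+1 <= 1, per the fix above */
        float *work = malloc((size_t)lwork * sizeof *work);
        if (work == NULL) return -1;
        /* 2nd call: the actual reduction. */
        sgehrd_(&n, &ilo, &ihi, a, &lda, tau, work, &lwork, &info);
        free(work);
        return info;
    }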
EXTERNAL SAXPY, SGEHD2, SGEMM, SLAHR2, SLARFB, STRMM, @@ -222,13 +222,19 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'SGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'SGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + ENDIF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -249,7 +255,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -269,7 +274,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of @@ -345,7 +350,8 @@ * Use unblocked code to reduce the rest of the matrix * CALL SGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sgelq.f b/lapack-netlib/SRC/sgelq.f index 74c7cc267..75f02675d 100644 --- a/lapack-netlib/SRC/sgelq.f +++ b/lapack-netlib/SRC/sgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -295,9 +295,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -322,7 +322,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) RETURN * * End of SGELQ diff --git a/lapack-netlib/SRC/sgelqf.f b/lapack-netlib/SRC/sgelqf.f index 1ceec4742..3b3913d84 100644 --- a/lapack-netlib/SRC/sgelqf.f +++ b/lapack-netlib/SRC/sgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -175,9 +176,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'SGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -185,19 +185,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. 
LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -267,7 +273,7 @@ $ CALL SGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of SGELQF diff --git a/lapack-netlib/SRC/sgemlq.f b/lapack-netlib/SRC/sgemlq.f index 83536825c..7e4d9bf65 100644 --- a/lapack-netlib/SRC/sgemlq.f +++ b/lapack-netlib/SRC/sgemlq.f @@ -110,13 +110,14 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this *> value as WORK(1), and no error message related to WORK @@ -187,7 +188,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -207,7 +208,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -222,6 +223,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -250,12 +258,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = SROUNDUP_LWORK( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -267,7 +275,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -280,7 +288,7 @@ $ MB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/sgemqr.f b/lapack-netlib/SRC/sgemqr.f index 3207f8bfd..19bf467b8 100644 --- a/lapack-netlib/SRC/sgemqr.f +++ b/lapack-netlib/SRC/sgemqr.f @@ -189,12 +189,13 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMQRT, SLAMTSQR, XERBLA @@ -206,7 +207,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -221,6 +222,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. 
( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -249,12 +257,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -266,7 +274,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -279,7 +287,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/sgeqlf.f b/lapack-netlib/SRC/sgeqlf.f index b1266c89e..14942b765 100644 --- a/lapack-netlib/SRC/sgeqlf.f +++ b/lapack-netlib/SRC/sgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is the *> optimal blocksize. *> @@ -189,8 +190,9 @@ END IF WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * diff --git a/lapack-netlib/SRC/sgeqp3rk.c b/lapack-netlib/SRC/sgeqp3rk.c new file mode 100644 index 000000000..fe52901bf --- /dev/null +++ b/lapack-netlib/SRC/sgeqp3rk.c @@ -0,0 +1,1055 @@ +#include <math.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <complex.h> +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char
*iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? 
(__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double 
zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + +/* Check if the matrix A contains +Inf or -Inf, set INFO parameter */ +/* to the column number, where the first +/-Inf is found plus N, */ +/* and continue the computation. */ + + *info = *n + kp1; + + } + +/* ================================================================== */ + +/* Quick return if possible for the case when the first */ +/* stopping criterion is satisfied, i.e. KMAX = 0. */ + + if (*kmax == 0) { + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.f; + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + tau[j] = 0.f; + } + work[1] = (real) lwkopt; + return 0; + } + +/* ================================================================== */ + + eps = slamch_("Epsilon"); + +/* Adjust ABSTOL */ + + if (*abstol >= 0.f) { + safmin = slamch_("Safe minimum"); +/* Computing MAX */ + r__1 = *abstol, r__2 = safmin * 2.f; + *abstol = f2cmax(r__1,r__2); + } + +/* Adjust RELTOL */ + + if (*reltol >= 0.f) { + *reltol = f2cmax(*reltol,eps); + } + +/* =================================================================== */ + +/* JMAX is the maximum index of the column to be factorized, */ +/* which is also limited by the first stopping criterion KMAX. */ + + jmax = f2cmin(*kmax,minmn); + +/* =================================================================== */ + +/* Quick return if possible for the case when the second or third */ +/* stopping criterion for the whole original matrix is satified, */ +/* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */ +/* (which is ONE <= RELTOL). */ + + if (maxc2nrm <= *abstol || 1.f <= *reltol) { + + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.f; + + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + tau[j] = 0.f; + } + + work[1] = (real) lwkopt; + return 0; + } + +/* ================================================================== */ +/* Factorize columns */ +/* ================================================================== */ + +/* Determine the block size. */ + + nbmin = 2; + nx = 0; + + if (nb > 1 && nb < minmn) { + +/* Determine when to cross over from blocked to unblocked code. */ +/* (for N less than NX, unblocked code should be used). 
*/ + +/* Computing MAX */ + i__1 = 0, i__2 = ilaenv_(&c__3, "SGEQP3RK", " ", m, n, &c_n1, &c_n1, ( + ftnlen)8, (ftnlen)1); + nx = f2cmax(i__1,i__2); + + if (nx < minmn) { + +/* Determine if workspace is large enough for blocked code. */ + + if (*lwork < lwkopt) { + +/* Not enough workspace to use optimal block size that */ +/* is currently stored in NB. */ +/* Reduce NB and determine the minimum value of NB. */ + + nb = (*lwork - (*n << 1)) / (*n + 1); +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "SGEQP3RK", " ", m, n, &c_n1, + &c_n1, (ftnlen)8, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + } + } + } + +/* ================================================================== */ + +/* DONE is the boolean flag to rerpresent the case when the */ +/* factorization completed in the block factorization routine, */ +/* before the end of the block. */ + + done = FALSE_; + +/* J is the column index. */ + + j = 1; + +/* (1) Use blocked code initially. */ + +/* JMAXB is the maximum column index of the block, when the */ +/* blocked code is used, is also limited by the first stopping */ +/* criterion KMAX. */ + +/* Computing MIN */ + i__1 = *kmax, i__2 = minmn - nx; + jmaxb = f2cmin(i__1,i__2); + + if (nb >= nbmin && nb < jmax && jmaxb > 0) { + +/* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */ +/* J is the column index of a column block; */ +/* JB is the column block size to pass to block factorization */ +/* routine in a loop step; */ +/* JBF is the number of columns that were actually factorized */ +/* that was returned by the block factorization routine */ +/* in a loop step, JBF <= JB; */ +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + while(j <= jmaxb) { + +/* Computing MIN */ + i__1 = nb, i__2 = jmaxb - j + 1; + jb = f2cmin(i__1,i__2); + n_sub__ = *n - j + 1; + ioffset = j - 1; + +/* Factorize JB columns among the columns A(J:N). */ + + i__1 = *n + *nrhs - j + 1; + slaqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, + &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, + maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], & + work[*n + j], &work[(*n << 1) + 1], &work[(*n << 1) + jb + + 1], &i__1, &iwork[1], &iinfo); + +/* Set INFO on the first occurence of Inf. */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } + + if (done) { + +/* Either the submatrix is zero before the end of the */ +/* column block, or ABSTOL or RELTOL criterion is */ +/* satisfied before the end of the column block, we can */ +/* return from the routine. Perform the following before */ +/* returning: */ +/* a) Set the number of factorized columns K, */ +/* K = IOFFSET + JBF from the last call of blocked */ +/* routine. */ +/* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */ +/* by the block factorization routine; */ +/* 2) The remaining TAUs are set to ZERO by the */ +/* block factorization routine. */ + + *k = ioffset + jbf; + +/* Set INFO on the first occurrence of NaN, NaN takes */ +/* prcedence over Inf. */ + + if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + +/* Return from the routine. */ + + work[1] = (real) lwkopt; + + return 0; + + } + + j += jbf; + + } + + } + +/* Use unblocked code to factor the last or only block. */ +/* J = JMAX+1 means we factorized the maximum possible number of */ +/* columns, that is in ELSE clause we need to compute */ +/* the MAXC2NORM and RELMAXC2NORM to return after we processed */ +/* the blocks. 
*/ + + if (j <= jmax) { + +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + n_sub__ = *n - j + 1; + ioffset = j - 1; + + i__1 = jmax - j + 1; + slaqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, & + maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, + relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], &work[*n + j], & + work[(*n << 1) + 1], &iinfo); + +/* ABSTOL or RELTOL criterion is satisfied when the number of */ +/* the factorized columns KF is smaller then the number */ +/* of columns JMAX-J+1 supplied to be factorized by the */ +/* unblocked routine, we can return from */ +/* the routine. Perform the following before returning: */ +/* a) Set the number of factorized columns K, */ +/* b) MAXC2NRMK and RELMAXC2NRMK are returned by the */ +/* unblocked factorization routine above. */ + + *k = j - 1 + kf; + +/* Set INFO on the first exception occurence. */ + +/* Set INFO on the first exception occurence of Inf or NaN, */ +/* (NaN takes precedence over Inf). */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } else if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + + } else { + +/* Compute the return values for blocked code. */ + +/* Set the number of factorized columns if the unblocked routine */ +/* was not called. */ + + *k = jmax; + +/* If there exits a residual matrix after the blocked code: */ +/* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */ +/* residual matrix, otherwise set them to ZERO; */ +/* 2) Set TAU(K+1:MINMN) to ZERO. */ + + if (*k < minmn) { + i__1 = *n - *k; + jmaxc2nrm = *k + isamax_(&i__1, &work[*k + 1], &c__1); + *maxc2nrmk = work[jmaxc2nrm]; + if (*k == 0) { + *relmaxc2nrmk = 1.f; + } else { + *relmaxc2nrmk = *maxc2nrmk / maxc2nrm; + } + + i__1 = minmn; + for (j = *k + 1; j <= i__1; ++j) { + tau[j] = 0.f; + } + + } + +/* END IF( J.LE.JMAX ) THEN */ + + } + + work[1] = (real) lwkopt; + + return 0; + +/* End of SGEQP3RK */ + +} /* sgeqp3rk_ */ + diff --git a/lapack-netlib/SRC/sgeqp3rk.f b/lapack-netlib/SRC/sgeqp3rk.f new file mode 100644 index 000000000..d3a335b88 --- /dev/null +++ b/lapack-netlib/SRC/sgeqp3rk.f @@ -0,0 +1,1083 @@ +*> \brief \b SGEQP3RK computes a truncated Householder QR factorization with column pivoting of a real m-by-n matrix A by using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SGEQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, +* $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ WORK, LWORK, IWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, K, KMAX, LDA, LWORK, M, N, NRHS +* REAL ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* REAL A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SGEQP3RK performs two tasks simultaneously: +*> +*> Task 1: The routine computes a truncated (rank K) or full rank +*> Householder QR factorization with column pivoting of a real +*> M-by-N matrix A using Level 3 BLAS. K is the number of columns +*> that were factorized, i.e. factorization rank of the +*> factor R, K <= min(M,N). 
+*>
+*>              A * P(K) = Q(K) * R(K) =
+*>
+*>        = Q(K) * ( R11(K) R12(K) ) = Q(K) * ( R(K)_approx )
+*>                 (   0    R22(K) )          ( 0  R(K)_residual ),
+*>
+*> where:
+*>
+*>   P(K)            is an N-by-N permutation matrix;
+*>   Q(K)            is an M-by-M orthogonal matrix;
+*>   R(K)_approx   = ( R11(K), R12(K) ) is a rank K approximation of
+*>                   the full rank factor R with K-by-K upper-triangular
+*>                   R11(K) and K-by-N rectangular R12(K). The diagonal
+*>                   entries of R11(K) appear in non-increasing order
+*>                   of absolute value, and absolute values of all of
+*>                   them exceed the maximum column 2-norm of R22(K)
+*>                   up to roundoff error.
+*>   R(K)_residual = R22(K) is the residual of a rank K approximation
+*>                   of the full rank factor R. It is
+*>                   an (M-K)-by-(N-K) rectangular matrix;
+*>   0               is an (M-K)-by-K zero matrix.
+*>
+*> Task 2: At the same time, the routine overwrites a real M-by-NRHS
+*>         matrix B with Q(K)**T * B using Level 3 BLAS.
+*>
+*> =====================================================================
+*>
+*> The matrices A and B are stored on input in the array A as
+*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS)
+*> respectively.
+*>
+*>                            N     NRHS
+*>        array_A  =  M  [ mat_A, mat_B ]
+*>
+*> The truncation criteria (i.e. when to stop the factorization)
+*> can be any of the following:
+*>
+*>   1) The input parameter KMAX, the maximum number of columns
+*>      KMAX to factorize, i.e. the factorization rank is limited
+*>      to KMAX. If KMAX >= min(M,N), the criterion is not used.
+*>
+*>   2) The input parameter ABSTOL, the absolute tolerance for
+*>      the maximum column 2-norm of the residual matrix R22(K). This
+*>      means that the factorization stops if this norm is less than
+*>      or equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used.
+*>
+*>   3) The input parameter RELTOL, the tolerance for the maximum
+*>      column 2-norm of the residual matrix R22(K) divided
+*>      by the maximum column 2-norm of the original matrix A, which
+*>      is equal to abs(R(1,1)). This means that the factorization stops
+*>      when the ratio of the maximum column 2-norm of R22(K) to
+*>      the maximum column 2-norm of A is less than or equal to RELTOL.
+*>      If RELTOL < 0.0, the criterion is not used.
+*>
+*>   4) In case both stopping criteria ABSTOL and RELTOL are not used,
+*>      the factorization stops when the residual matrix R22(K) is
+*>      a zero matrix in some factorization step K.
+*>      ( This stopping criterion is implicit. )
+*>
+*> The algorithm stops when any of these conditions is first
+*> satisfied, otherwise the whole matrix A is factorized.
+*>
+*> To factorize the whole matrix A, use the values
+*> KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0.
+*>
+*> The routine returns:
+*>   a) Q(K), R(K)_approx = ( R11(K), R12(K) ),
+*>      R(K)_residual = R22(K), P(K), i.e. the resulting matrices
+*>      of the factorization; P(K) is represented by JPIV,
+*>      ( if K = min(M,N), R(K)_approx is the full factor R,
+*>      and there is no residual matrix R(K)_residual);
+*>   b) K, the number of columns that were factorized,
+*>      i.e. factorization rank;
+*>   c) MAXC2NRMK, the maximum column 2-norm of the residual
+*>      matrix R(K)_residual = R22(K),
+*>      ( if K = min(M,N), MAXC2NRMK = 0.0 );
+*>   d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum
+*>      column 2-norm of the original matrix A, which is equal
+*>      to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 );
+*>   e) Q(K)**T * B, the matrix B with the orthogonal
+*>      transformation Q(K)**T applied on the left.
+*>
+*> The N-by-N permutation matrix P(K) is stored in a compact form in
+*> the integer array JPIV.
For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The M-by-M orthogonal matrix Q is represented as a product +*> of elementary Householder reflectors +*> +*> Q(K) = H(1) * H(2) * . . . * H(K), +*> +*> where K is the number of columns that were factorized. +*> +*> Each H(j) has the form +*> +*> H(j) = I - tau * v * v**T, +*> +*> where 1 <= j <= K and +*> I is an M-by-M identity matrix, +*> tau is a real scalar, +*> v is a real vector with v(1:j-1) = 0 and v(j) = 1. +*> +*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j). +*> +*> See the Further Details section for more information. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e. the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M,N), then this stopping criterion +*> is not used, the routine factorizes columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B are not modified, and +*> the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL +*> +*> The second factorization stopping criterion, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix R22(K). +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix R22(K) +*> is less than or equal to ABSTOL. Let SAFMIN = SLAMCH('S'). +*> +*> a) If ABSTOL is NaN, then no computation is performed +*> and an error message ( INFO = -5 ) is issued +*> by XERBLA. +*> +*> b) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN +*> is used. This includes the case ABSTOL = -0.0. +*> +*> d) If 2*SAFMIN <= ABSTOL then the input value +*> of ABSTOL is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If ABSTOL chosen above is >= MAXC2NRM, then this +*> stopping criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. The routine +*> returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case ABSTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is REAL +*> +*> The third factorization stopping criterion, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio +*> abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of +*> the residual matrix R22(K) to the maximum column 2-norm of +*> the original matrix A. 
The algorithm converges (stops the
+*>          factorization) when abs(R(K+1,K+1))/abs(R(1,1)) is less
+*>          than or equal to RELTOL. Let EPS = SLAMCH('E').
+*>
+*>          a) If RELTOL is NaN, then no computation is performed
+*>             and an error message ( INFO = -6 ) is issued
+*>             by XERBLA.
+*>
+*>          b) If RELTOL < 0.0, then this stopping criterion is not
+*>             used, the routine factorizes columns depending
+*>             on KMAX and ABSTOL.
+*>             This includes the case RELTOL = -Inf.
+*>
+*>          c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used.
+*>             This includes the case RELTOL = -0.0.
+*>
+*>          d) If EPS <= RELTOL then the input value of RELTOL
+*>             is used.
+*>
+*>          Let MAXC2NRM be the maximum column 2-norm of the
+*>          whole original matrix A.
+*>          If RELTOL chosen above is >= 1.0, then this stopping
+*>          criterion is satisfied on input and the routine exits
+*>          immediately after MAXC2NRM is computed.
+*>          The routine returns MAXC2NRM in MAXC2NRMK,
+*>          and 1.0 in RELMAXC2NRMK.
+*>          This includes the case RELTOL = +Inf. This means that the
+*>          factorization is not performed, the matrices A and B are
+*>          not modified, and the matrix A is itself the residual.
+*>
+*>          NOTE: We recommend that RELTOL satisfy
+*>                min( max(M,N)*EPS, sqrt(EPS) ) <= RELTOL
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N+NRHS)
+*>
+*>          On entry:
+*>
+*>          a) The subarray A(1:M,1:N) contains the M-by-N matrix A.
+*>          b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS
+*>             matrix B.
+*>
+*>                            N     NRHS
+*>          array_A = M [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>
+*>          a) The subarray A(1:M,1:N) contains parts of the factors
+*>             of the matrix A:
+*>
+*>            1) If K = 0, A(1:M,1:N) contains the original matrix A.
+*>            2) If K > 0, A(1:M,1:N) contains parts of the
+*>               factors:
+*>
+*>              1. The elements below the diagonal of the subarray
+*>                 A(1:M,1:K) together with TAU(1:K) represent the
+*>                 orthogonal matrix Q(K) as a product of K Householder
+*>                 elementary reflectors.
+*>
+*>              2. The elements on and above the diagonal of
+*>                 the subarray A(1:K,1:N) contain the K-by-N
+*>                 upper-trapezoidal matrix
+*>                 R(K)_approx = ( R11(K), R12(K) ).
+*>                 NOTE: If K=min(M,N), i.e. full rank factorization,
+*>                       then R_approx(K) is the full factor R which
+*>                       is upper-trapezoidal. If, in addition, M>=N,
+*>                       then R is upper-triangular.
+*>
+*>              3. The subarray A(K+1:M,K+1:N) contains the
+*>                 (M-K)-by-(N-K) rectangular matrix
+*>                 R(K)_residual = R22(K).
+*>
+*>          b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains
+*>             the M-by-NRHS product Q(K)**T * B.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*>          This is the leading dimension for both matrices, A and B.
+*> \endverbatim
+*>
+*> \param[out] K
+*> \verbatim
+*>          K is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= K <= min(M,KMAX,N).
+*>
+*>          K also represents the number of non-zero Householder
+*>          vectors.
+*>
+*>          NOTE: If K = 0, a) the arrays A and B are not modified;
+*>                          b) the array TAU(1:min(M,N)) is set to ZERO,
+*>                             if the matrix A does not contain NaN,
+*>                             otherwise the elements TAU(1:min(M,N))
+*>                             are undefined;
+*>                          c) the elements of the array JPIV are set
+*>                             as follows: for j = 1:N, JPIV(j) = j.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is REAL
+*>          The maximum column 2-norm of the residual matrix R22(K),
+*>          when the factorization stopped at rank K. MAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then MAXC2NRMK equals the maximum column 2-norm
+*>             of the original matrix A.
+*>
+*>          b) If 0 < K < min(M,N), then MAXC2NRMK is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then MAXC2NRMK = 0.0.
+*>
+*>          NOTE: MAXC2NRMK in the factorization step K would equal
+*>                R(K+1,K+1) in the next factorization step K+1.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is REAL
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix R22(K) (when the factorization
+*>          stopped at rank K) to the maximum column 2-norm of the
+*>          whole original matrix A. RELMAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then RELMAXC2NRMK = 1.0.
+*>
+*>          b) If 0 < K < min(M,N), then
+*>             RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then RELMAXC2NRMK = 0.0.
+*>
+*>          NOTE: RELMAXC2NRMK in the factorization step K would equal
+*>                abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization
+*>                step K+1.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices. For 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*>
+*>          The elements of the array JPIV(1:N) are always set
+*>          by the routine; for example, even when no columns
+*>          were factorized, i.e. when K = 0, the elements are
+*>          set as JPIV(j) = j for j = 1:N.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is REAL array, dimension (min(M,N))
+*>          The scalar factors of the elementary reflectors.
+*>
+*>          If 0 < K <= min(M,N), only the elements TAU(1:K) of
+*>          the array TAU are modified by the factorization.
+*>          After the factorization is computed, if no NaN was found
+*>          during the factorization, the remaining elements
+*>          TAU(K+1:min(M,N)) are set to zero, otherwise the
+*>          elements TAU(K+1:min(M,N)) are not set and therefore
+*>          undefined.
+*>          ( If K = 0, all elements of TAU are set to zero, if
+*>          the matrix A does not contain NaN. )
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and
+*>          LWORK >= (3*N+NRHS-1), otherwise.
+*>          For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )),
+*>          where NB is the optimal block size for SGEQP3RK returned
+*>          by ILAENV. Minimal block size MINNB=2.
+*>
+*>          NOTE: The decision whether to use unblocked BLAS 2
+*>          or blocked BLAS 3 code is based not only on the dimension
+*>          LWORK of the available workspace WORK, but also on the
+*>          matrix A dimension N via the crossover point NX returned
+*>          by ILAENV. (For N less than NX, unblocked code should be
+*>          used.)
+*>
+*>          If LWORK = -1, then a workspace query is assumed;
+*>          the routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
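+*>
+*>          A minimal calling sketch of the workspace query
+*>          ( illustrative only; it assumes the caller declares a
+*>          scratch scalar WQUERY(1) and an allocatable array WORK,
+*>          names which are placeholders and not part of this
+*>          interface ):
+*>
+*>             CALL SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+*>            $               A, LDA, K, MAXC2NRMK, RELMAXC2NRMK,
+*>            $               JPIV, TAU, WQUERY, -1, IWORK, INFO )
+*>             LWORK = INT( WQUERY( 1 ) )
+*>             ALLOCATE( WORK( LWORK ) )
+*>             CALL SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+*>            $               A, LDA, K, MAXC2NRMK, RELMAXC2NRMK,
+*>            $               JPIV, TAU, WORK, LWORK, IWORK, INFO )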
+*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix in the blocked step auxiliary subroutine SLAQP3RK ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) INFO < 0: if INFO = -i, the i-th argument had an +*> illegal value. +*> 3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup geqp3rk +* +*> \par Further Details: +* ===================== +* +*> \verbatim +*> SGEQP3RK is based on the same BLAS3 Householder QR factorization +*> algorithm with column pivoting as in SGEQP3 routine which uses +*> SLARFG routine to generate Householder reflectors +*> for QR factorization. +*> +*> We can also write: +*> +*> A = A_approx(K) + A_residual(K) +*> +*> The low rank approximation matrix A(K)_approx from +*> the truncated QR factorization of rank K of the matrix A is: +*> +*> A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T +*> ( 0 0 ) +*> +*> = Q(K) * ( R11(K) R12(K) ) * P(K)**T +*> ( 0 0 ) +*> +*> The residual A_residual(K) of the matrix A is: +*> +*> A_residual(K) = Q(K) * ( 0 0 ) * P(K)**T = +*> ( 0 R(K)_residual ) +*> +*> = Q(K) * ( 0 0 ) * P(K)**T +*> ( 0 R22(K) ) +*> +*> The truncated (rank K) factorization guarantees that +*> the maximum column 2-norm of A_residual(K) is less than +*> or equal to MAXC2NRMK up to roundoff error. +*> +*> NOTE: An approximation of the null vectors +*> of A can be easily computed from R11(K) +*> and R12(K): +*> +*> Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) ) +*> ( -I ) +*> +*> \endverbatim +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. 
+*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, + $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ WORK, LWORK, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS + REAL ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + REAL A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER INB, INBMIN, IXOVER + PARAMETER ( INB = 1, INBMIN = 2, IXOVER = 3 ) + REAL ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, DONE + INTEGER IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX, + $ JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB, + $ NBMIN, NX + REAL EPS, HUGEVAL, MAXC2NRM, SAFMIN +* .. +* .. External Subroutines .. + EXTERNAL SLAQP2RK, SLAQP3RK, XERBLA +* .. +* .. External Functions .. + LOGICAL SISNAN + INTEGER ISAMAX, ILAENV + REAL SLAMCH, SNRM2, SROUNDUP_LWORK + EXTERNAL SISNAN, SLAMCH, SNRM2, ISAMAX, ILAENV, + $ SROUNDUP_LWORK +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test input arguments +* ==================== +* + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( KMAX.LT.0 ) THEN + INFO = -4 + ELSE IF( SISNAN( ABSTOL ) ) THEN + INFO = -5 + ELSE IF( SISNAN( RELTOL ) ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -8 + END IF +* +* If the input parameters M, N, NRHS, KMAX, LDA are valid: +* a) Test the input workspace size LWORK for the minimum +* size requirement IWS. +* b) Determine the optimal block size NB and optimal +* workspace size LWKOPT to be returned in WORK(1) +* in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., +* (3) when routine exits. +* Here, IWS is the miminum workspace required for unblocked +* code. +* + IF( INFO.EQ.0 ) THEN + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + IWS = 1 + LWKOPT = 1 + ELSE +* +* Minimal workspace size in case of using only unblocked +* BLAS 2 code in SLAQP2RK. +* 1) SGEQP3RK and SLAQP2RK: 2*N to store full and partial +* column 2-norms. +* 2) SLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in SLARF subroutine inside SLAQP2RK to apply an +* elementary reflector from the left. +* TOTAL_WORK_SIZE = 3*N + NRHS - 1 +* + IWS = 3*N + NRHS - 1 +* +* Assign to NB optimal block size. +* + NB = ILAENV( INB, 'SGEQP3RK', ' ', M, N, -1, -1 ) +* +* A formula for the optimal workspace size in case of using +* both unblocked BLAS 2 in SLAQP2RK and blocked BLAS 3 code +* in SLAQP3RK. 
+* 1) SGEQP3RK, SLAQP2RK, SLAQP3RK: 2*N to store full and +* partial column 2-norms. +* 2) SLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in SLARF subroutine to apply an elementary reflector +* from the left. +* 3) SLAQP3RK: NB*(N+NRHS) to use in the work array F that +* is used to apply a block reflector from +* the left. +* 4) SLAQP3RK: NB to use in the auxilixary array AUX. +* Sizes (2) and ((3) + (4)) should intersect, therefore +* TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. +* + LWKOPT = 2*N + NB*( N+NRHS+1 ) + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* + IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN + INFO = -15 + END IF + END IF +* +* NOTE: The optimal workspace size is returned in WORK(1), if +* the input parameters M, N, NRHS, KMAX, LDA are valid. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGEQP3RK', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible for M=0 or N=0. +* + IF( MINMN.EQ.0 ) THEN + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* +* Initialize column pivot array JPIV. +* + DO J = 1, N + JPIV( J ) = J + END DO +* +* ================================================================== +* +* Initialize storage for partial and exact column 2-norms. +* a) The elements WORK(1:N) are used to store partial column +* 2-norms of the matrix A, and may decrease in each computation +* step; initialize to the values of complete columns 2-norms. +* b) The elements WORK(N+1:2*N) are used to store complete column +* 2-norms of the matrix A, they are not changed during the +* computation; initialize the values of complete columns 2-norms. +* + DO J = 1, N + WORK( J ) = SNRM2( M, A( 1, J ), 1 ) + WORK( N+J ) = WORK( J ) + END DO +* +* ================================================================== +* +* Compute the pivot column index and the maximum column 2-norm +* for the whole original matrix stored in A(1:M,1:N). +* + KP1 = ISAMAX( N, WORK( 1 ), 1 ) + MAXC2NRM = WORK( KP1 ) +* +* ==================================================================. +* + IF( SISNAN( MAXC2NRM ) ) THEN +* +* Check if the matrix A contains NaN, set INFO parameter +* to the column number where the first NaN is found and return +* from the routine. +* + K = 0 + INFO = KP1 +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = MAXC2NRM +* +* Array TAU is not set and contains undefined elements. +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RETURN + END IF +* +* =================================================================== +* + IF( MAXC2NRM.EQ.ZERO ) THEN +* +* Check is the matrix A is a zero matrix, set array TAU and +* return from the routine. +* + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + DO J = 1, MINMN + TAU( J ) = ZERO + END DO +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RETURN +* + END IF +* +* =================================================================== +* + HUGEVAL = SLAMCH( 'Overflow' ) +* + IF( MAXC2NRM.GT.HUGEVAL ) THEN +* +* Check if the matrix A contains +Inf or -Inf, set INFO parameter +* to the column number, where the first +/-Inf is found plus N, +* and continue the computation. +* + INFO = N + KP1 +* + END IF +* +* ================================================================== +* +* Quick return if possible for the case when the first +* stopping criterion is satisfied, i.e. KMAX = 0. 
+*
+      IF( KMAX.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+         DO J = 1, MINMN
+            TAU( J ) = ZERO
+         END DO
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+* ==================================================================
+*
+      EPS = SLAMCH('Epsilon')
+*
+* Adjust ABSTOL
+*
+      IF( ABSTOL.GE.ZERO ) THEN
+         SAFMIN = SLAMCH('Safe minimum')
+         ABSTOL = MAX( ABSTOL, TWO*SAFMIN )
+      END IF
+*
+* Adjust RELTOL
+*
+      IF( RELTOL.GE.ZERO ) THEN
+         RELTOL = MAX( RELTOL, EPS )
+      END IF
+*
+* ===================================================================
+*
+* JMAX is the maximum index of the column to be factorized,
+* which is also limited by the first stopping criterion KMAX.
+*
+      JMAX = MIN( KMAX, MINMN )
+*
+* ===================================================================
+*
+* Quick return if possible for the case when the second or third
+* stopping criterion for the whole original matrix is satisfied,
+* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL
+* (which is ONE <= RELTOL).
+*
+      IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN
+*
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+*
+         DO J = 1, MINMN
+            TAU( J ) = ZERO
+         END DO
+*
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+* ==================================================================
+* Factorize columns
+* ==================================================================
+*
+* Determine the block size.
+*
+      NBMIN = 2
+      NX = 0
+*
+      IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN
+*
+*        Determine when to cross over from blocked to unblocked code.
+*        (For N less than NX, unblocked code should be used.)
+*
+         NX = MAX( 0, ILAENV( IXOVER, 'SGEQP3RK', ' ', M, N, -1, -1 ))
+*
+         IF( NX.LT.MINMN ) THEN
+*
+*           Determine if workspace is large enough for blocked code.
+*
+            IF( LWORK.LT.LWKOPT ) THEN
+*
+*              Not enough workspace to use the optimal block size
+*              that is currently stored in NB.
+*              Reduce NB and determine the minimum value of NB.
+*
+               NB = ( LWORK-2*N ) / ( N+1 )
+               NBMIN = MAX( 2, ILAENV( INBMIN, 'SGEQP3RK', ' ', M, N,
+     $                 -1, -1 ) )
+*
+            END IF
+         END IF
+      END IF
+*
+* ==================================================================
+*
+* DONE is the boolean flag that indicates that the factorization
+* was completed in the block factorization routine, before the
+* end of the block.
+*
+      DONE = .FALSE.
+*
+* J is the column index.
+*
+      J = 1
+*
+* (1) Use blocked code initially.
+*
+* JMAXB is the maximum column index of the block when the blocked
+* code is used; it is also limited by the first stopping
+* criterion KMAX.
+*
+      JMAXB = MIN( KMAX, MINMN - NX )
+*
+      IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN
+*
+* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here:
+* J is the column index of a column block;
+* JB is the column block size to pass to the block factorization
+* routine in a loop step;
+* JBF is the number of columns that were actually factorized,
+* as returned by the block factorization routine
+* in a loop step, JBF <= JB;
+* N_SUB is the number of columns in the submatrix;
+* IOFFSET is the number of rows that should not be factorized.
+*
+         DO WHILE( J.LE.JMAXB )
+*
+            JB = MIN( NB, JMAXB-J+1 )
+            N_SUB = N-J+1
+            IOFFSET = J-1
+*
+*           Factorize JB columns among the columns A(J:N).
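+*           Example: with N = 10, NB = 4 and JMAXB = 10, the loop
+*           visits J = 1, 5, 9 with block sizes JB = 4, 4, 2,
+*           provided each call factorizes its full block, i.e.
+*           JBF = JB in every step.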
+*
+            CALL SLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                     DONE, JBF, MAXC2NRMK, RELMAXC2NRMK,
+     $                     JPIV( J ), TAU( J ),
+     $                     WORK( J ), WORK( N+J ),
+     $                     WORK( 2*N+1 ), WORK( 2*N+JB+1 ),
+     $                     N+NRHS-J+1, IWORK, IINFO )
+*
+*           Set INFO on the first occurrence of Inf.
+*
+            IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+               INFO = 2*IOFFSET + IINFO
+            END IF
+*
+            IF( DONE ) THEN
+*
+*              Either the submatrix is zero before the end of the
+*              column block, or the ABSTOL or RELTOL criterion is
+*              satisfied before the end of the column block; in both
+*              cases we can return from the routine. Perform the
+*              following before returning:
+*                 a) Set the number of factorized columns K,
+*                    K = IOFFSET + JBF from the last call of the
+*                    blocked routine.
+*                 NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned
+*                          by the block factorization routine;
+*                       2) The remaining TAUs are set to ZERO by the
+*                          block factorization routine.
+*
+               K = IOFFSET + JBF
+*
+*              Set INFO on the first occurrence of NaN, NaN takes
+*              precedence over Inf.
+*
+               IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+                  INFO = IOFFSET + IINFO
+               END IF
+*
+*              Return from the routine.
+*
+               WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
+               RETURN
+*
+            END IF
+*
+            J = J + JBF
+*
+         END DO
+*
+      END IF
+*
+* Use unblocked code to factor the last or only block.
+* J = JMAX+1 means we factorized the maximum possible number of
+* columns; that is, in the ELSE clause we need to compute
+* MAXC2NRMK and RELMAXC2NRMK to return after we processed
+* the blocks.
+*
+      IF( J.LE.JMAX ) THEN
+*
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         N_SUB = N-J+1
+         IOFFSET = J-1
+*
+         CALL SLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1,
+     $                  ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                  KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ),
+     $                  TAU( J ), WORK( J ), WORK( N+J ),
+     $                  WORK( 2*N+1 ), IINFO )
+*
+*        The ABSTOL or RELTOL criterion is satisfied when the number
+*        of factorized columns KF is smaller than the number
+*        of columns JMAX-J+1 supplied to be factorized by the
+*        unblocked routine; in that case we can return from
+*        the routine. Perform the following before returning:
+*           a) Set the number of factorized columns K;
+*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the
+*              unblocked factorization routine above.
+*
+         K = J - 1 + KF
+*
+*        Set INFO on the first occurrence of Inf or NaN
+*        (NaN takes precedence over Inf).
+*
+         IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+            INFO = 2*IOFFSET + IINFO
+         ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+            INFO = IOFFSET + IINFO
+         END IF
+*
+      ELSE
+*
+*        Compute the return values for blocked code.
+*
+*        Set the number of factorized columns if the unblocked
+*        routine was not called.
+*
+         K = JMAX
+*
+*        If there exists a residual matrix after the blocked code:
+*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the
+*              residual matrix, otherwise set them to ZERO;
+*           2) Set TAU(K+1:MINMN) to ZERO.
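+*
+*        NOTE: WORK( K+1:N ) still holds the partial column 2-norms
+*        of the residual columns, as maintained by the blocked
+*        factorization routine; this is why the ISAMAX search below
+*        over WORK( K+1:N ) recovers the maximum column 2-norm of
+*        the residual matrix.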
+* + IF( K.LT.MINMN ) THEN + JMAXC2NRM = K + ISAMAX( N-K, WORK( K+1 ), 1 ) + MAXC2NRMK = WORK( JMAXC2NRM ) + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + DO J = K + 1, MINMN + TAU( J ) = ZERO + END DO +* + END IF +* +* END IF( J.LE.JMAX ) THEN +* + END IF +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* + RETURN +* +* End of SGEQP3RK +* + END diff --git a/lapack-netlib/SRC/sgeqr.f b/lapack-netlib/SRC/sgeqr.f index 6f41a92ea..79a515e1c 100644 --- a/lapack-netlib/SRC/sgeqr.f +++ b/lapack-netlib/SRC/sgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE SGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,11 +190,13 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLATSQR, SGEQRT, XERBLA @@ -244,8 +248,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +259,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +274,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +288,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +315,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) * RETURN * diff --git a/lapack-netlib/SRC/sgeqrfp.f b/lapack-netlib/SRC/sgeqrfp.f index d1ee2a828..37747c512 100644 --- a/lapack-netlib/SRC/sgeqrfp.f +++ b/lapack-netlib/SRC/sgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. EXTERNAL SGEQR2P, SLARFB, SLARFT, XERBLA @@ -173,8 +174,9 @@ * .. * .. External Functions .. 
INTEGER ILAENV + EXTERNAL ILAENV REAL SROUNDUP_LWORK - EXTERNAL ILAENV, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -182,8 +184,16 @@ * INFO = 0 NB = ILAENV( 1, 'SGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -191,7 +201,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -211,7 +221,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. @@ -273,7 +283,7 @@ $ CALL SGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of SGEQRFP diff --git a/lapack-netlib/SRC/sgesvj.f b/lapack-netlib/SRC/sgesvj.f index 3f53a5a15..36aed2853 100644 --- a/lapack-netlib/SRC/sgesvj.f +++ b/lapack-netlib/SRC/sgesvj.f @@ -208,7 +208,7 @@ *> *> \param[in,out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On entry, *> If JOBU = 'C' : *> WORK(1) = CTOL, where CTOL defines the threshold for convergence. @@ -239,7 +239,12 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> length of WORK, WORK >= MAX(6,M+N) +*> Length of WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(6,M+N), otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; WORK(1) is set to the minial (and optimal) +*> length of WORK. *> \endverbatim *> *> \param[out] INFO @@ -260,7 +265,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -351,9 +356,9 @@ INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, - $ SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + $ SWBAND, MINMN, LWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, + $ ROTOK, RSVEC, UCTOL, UPPER * .. * .. Local Arrays .. REAL FASTR( 5 ) @@ -369,8 +374,8 @@ INTEGER ISAMAX EXTERNAL ISAMAX * from LAPACK - REAL SLAMCH - EXTERNAL SLAMCH + REAL SLAMCH, SROUNDUP_LWORK + EXTERNAL SLAMCH, SROUNDUP_LWORK LOGICAL LSAME EXTERNAL LSAME * .. @@ -394,6 +399,14 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 6, M+N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -413,7 +426,7 @@ INFO = -11 ELSE IF( UCTOL .AND. ( WORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( LWORK.LT.MAX( M+N, 6 ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 ELSE INFO = 0 @@ -423,11 +436,14 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGESVJ', -INFO ) RETURN + ELSE IF( LQUERY ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. 
( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/sgetri.f b/lapack-netlib/SRC/sgetri.f index fe71bc4a5..7b06bb63d 100644 --- a/lapack-netlib/SRC/sgetri.f +++ b/lapack-netlib/SRC/sgetri.f @@ -137,8 +137,9 @@ * .. * .. External Functions .. INTEGER ILAENV + EXTERNAL ILAENV REAL SROUNDUP_LWORK - EXTERNAL ILAENV, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMM, SGEMV, SSWAP, STRSM, STRTRI, XERBLA @@ -152,8 +153,9 @@ * INFO = 0 NB = ILAENV( 1, 'SGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -251,7 +253,7 @@ $ CALL SSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) 60 CONTINUE * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of SGETRI diff --git a/lapack-netlib/SRC/sgetsls.f b/lapack-netlib/SRC/sgetsls.f index d89c6a4e6..08a427a8b 100644 --- a/lapack-netlib/SRC/sgetsls.f +++ b/lapack-netlib/SRC/sgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -226,7 +226,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEO = 1 + WSIZEM = 1 + ELSE IF( M.GE.N ) THEN CALL SGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) diff --git a/lapack-netlib/SRC/sgetsqrhrt.f b/lapack-netlib/SRC/sgetsqrhrt.f index d80ff4da8..7ade8a66c 100644 --- a/lapack-netlib/SRC/sgetsqrhrt.f +++ b/lapack-netlib/SRC/sgetsqrhrt.f @@ -130,14 +130,17 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -216,7 +219,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -229,7 +232,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -267,8 +270,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. 
.NOT.LQUERY ) THEN INFO = -11 END IF * @@ -350,4 +354,4 @@ * * End of SGETSQRHRT * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/sgges3.f b/lapack-netlib/SRC/sgges3.f index e35d4955a..e90cd6947 100644 --- a/lapack-netlib/SRC/sgges3.f +++ b/lapack-netlib/SRC/sgges3.f @@ -234,6 +234,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK >= 6*N+16. +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -309,7 +311,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, LST2SL, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT + $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT, + $ LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SAFMAX, SAFMIN, SMLNUM * .. @@ -361,6 +364,12 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + IF( N.EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 6*N+16 + END IF +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -377,7 +386,7 @@ INFO = -15 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -17 - ELSE IF( LWORK.LT.6*N+16 .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -19 END IF * @@ -385,7 +394,7 @@ * IF( INFO.EQ.0 ) THEN CALL SGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 6*N+16, 3*N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL SORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) @@ -407,7 +416,11 @@ $ IERR ) LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) END IF - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + END IF END IF * IF( INFO.NE.0 ) THEN @@ -421,6 +434,7 @@ * IF( N.EQ.0 ) THEN SDIM = 0 + WORK( 1 ) = 1 RETURN END IF * @@ -657,7 +671,7 @@ * 40 CONTINUE * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sggev3.f b/lapack-netlib/SRC/sggev3.f index c82d2187f..d788d1147 100644 --- a/lapack-netlib/SRC/sggev3.f +++ b/lapack-netlib/SRC/sggev3.f @@ -189,6 +189,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. LWORK >= MAX(1,8*N). +*> For good performance, LWORK should generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -248,7 +250,8 @@ LOGICAL ILASCL, ILBSCL, ILV, ILVL, ILVR, LQUERY CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, - $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT + $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT, + $ LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP * .. @@ -298,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 8*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -312,7 +316,7 @@ INFO = -12 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -14 - ELSE IF( LWORK.LT.MAX( 1, 8*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -16 END IF * @@ -320,28 +324,31 @@ * IF( INFO.EQ.0 ) THEN CALL SGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, 8*N, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL SORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL SGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL, LDVL, $ VR, LDVR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) IF( ILVL ) THEN CALL SORGQR( N, N, N, VL, LDVL, WORK, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL SLAQZ0( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) ELSE CALL SLAQZ0( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF - WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) -* END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/sgghd3.f b/lapack-netlib/SRC/sgghd3.f index 9c5858b5a..01e57088a 100644 --- a/lapack-netlib/SRC/sgghd3.f +++ b/lapack-netlib/SRC/sgghd3.f @@ -179,14 +179,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> -*> \param[in] LWORK +*> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -276,7 +276,12 @@ * INFO = 0 NB = ILAENV( 1, 'SGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) @@ -326,7 +331,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = ONE RETURN @@ -886,6 +890,7 @@ IF ( JCOL.LT.IHI ) $ CALL SGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) +* WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/sggqrf.f b/lapack-netlib/SRC/sggqrf.f index ebb42a899..d32b48410 100644 --- a/lapack-netlib/SRC/sggqrf.f +++ b/lapack-netlib/SRC/sggqrf.f @@ -236,8 +236,9 @@ * .. * .. External Functions .. INTEGER ILAENV + EXTERNAL ILAENV REAL SROUNDUP_LWORK - EXTERNAL ILAENV, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. Intrinsic Functions .. 
INTRINSIC INT, MAX, MIN @@ -251,8 +252,9 @@ NB2 = ILAENV( 1, 'SGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'SORMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -289,6 +291,7 @@ * CALL SGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO ) LWKOPT = MAX( LOPT, INT( WORK( 1 ) ) ) +* WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/sggrqf.f b/lapack-netlib/SRC/sggrqf.f index 2163f1ef8..b3842ec2a 100644 --- a/lapack-netlib/SRC/sggrqf.f +++ b/lapack-netlib/SRC/sggrqf.f @@ -250,7 +250,7 @@ NB2 = ILAENV( 1, 'SGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'SORMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P)*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN diff --git a/lapack-netlib/SRC/sggsvd3.f b/lapack-netlib/SRC/sggsvd3.f index 053fff5de..cee630593 100644 --- a/lapack-netlib/SRC/sggsvd3.f +++ b/lapack-netlib/SRC/sggsvd3.f @@ -278,7 +278,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/sggsvp3.f b/lapack-netlib/SRC/sggsvp3.f index a463b9064..8e90d770c 100644 --- a/lapack-netlib/SRC/sggsvp3.f +++ b/lapack-netlib/SRC/sggsvp3.f @@ -227,7 +227,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -300,8 +300,9 @@ * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEQP3, SGEQR2, SGERQ2, SLACPY, SLAPMT, diff --git a/lapack-netlib/SRC/slamswlq.f b/lapack-netlib/SRC/slamswlq.f index d4996b1f2..432afaded 100644 --- a/lapack-netlib/SRC/slamswlq.f +++ b/lapack-netlib/SRC/slamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -189,33 +192,38 @@ *> SIAM J. Sci. Comput, vol. 34, no. 
1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE SLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + REAL A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL STPMLQT, SGEMLQT, XERBLA * .. @@ -223,52 +231,60 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * INFO = 0 IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLAMSWLQ', -INFO ) - WORK(1) = LW RETURN - ELSE IF (LQUERY) THEN - WORK(1) = LW + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -402,7 +418,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLAMSWLQ diff --git a/lapack-netlib/SRC/slamtsqr.f b/lapack-netlib/SRC/slamtsqr.f index 960b794de..f9b167aea 100644 --- a/lapack-netlib/SRC/slamtsqr.f +++ b/lapack-netlib/SRC/slamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) -*> +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. 
*> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -191,33 +193,38 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE SLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + REAL A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL SGEMQRT, STPMQRT, XERBLA * .. @@ -225,12 +232,13 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -238,11 +246,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -253,38 +267,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN. AND. 
(.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = LW + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL SGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -410,7 +424,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLAMTSQR diff --git a/lapack-netlib/SRC/slaqp2rk.c b/lapack-netlib/SRC/slaqp2rk.c new file mode 100644 index 000000000..0bfa71ab9 --- /dev/null +++ b/lapack-netlib/SRC/slaqp2rk.c @@ -0,0 +1,918 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; 
/*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + *info = *n + kk - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL >= ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL >= ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + +/* Set K, the number of factorized columns. */ + + *k = kk - 1; + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. */ + + i__2 = minmnfact; + for (j = kk; j <= i__2; ++j) { + tau[j] = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,KK:N): */ +/* 1) swap the KK-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) copy the KK-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than KK in the next loop step.) */ +/* 3) Save the pivot interchange with the indices relative to the */ +/* the original matrix A, not the block A(1:M,1:N). */ + + if (kp != kk) { + sswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1); + vn1[kp] = vn1[kk]; + vn2[kp] = vn2[kk]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[kk]; + jpiv[kk] = itemp; + } + +/* Generate elementary reflector H(KK) using the column A(I:M,KK), */ +/* if the column has more than one element, otherwise */ +/* the elementary reflector would be an identity matrix, */ +/* and TAU(KK) = ZERO. 
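*/

+/* Editor's aside (hedged, not part of the patch): the NaN and Inf
+   screening in this routine leans on IEEE-754 semantics: sisnan_
+   reduces to a self-comparison, and the Inf test compares the column
+   norm against SLAMCH( 'Overflow' ), i.e. FLT_MAX in single
+   precision. Minimal C equivalents, kept inside this comment so they
+   stay illustrative:
+
+       static int editor_is_nan( float x ) { return x != x; }
+       static int editor_is_inf( float x ) { return x > FLT_MAX; }
+
+   (editor_is_inf needs <float.h>; both names are hypothetical.) */

+/*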
*/ + + if (i__ < *m) { + i__2 = *m - i__ + 1; + slarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], & + c__1, &tau[kk]); + } else { + tau[kk] = 0.f; + } + +/* Check if TAU(KK) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(KK) for Inf, */ +/* since SLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by SLARFG can contain Inf, which requires */ +/* TAU(KK) to contain NaN. Therefore, this case of generating Inf */ +/* by SLARFG is covered by checking TAU(KK) for NaN. */ + + if (sisnan_(&tau[kk])) { + *k = kk - 1; + *info = kk; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = tau[kk]; + *relmaxc2nrmk = tau[kk]; + +/* Array TAU(KK:MINMNFACT) is not set and contains */ +/* undefined elements, except the first element TAU(KK) = NaN. */ + + return 0; + } + +/* Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left. */ +/* ( If M >= N, then at KK = N there is no residual matrix, */ +/* i.e. no columns of A to update, only columns of B. */ +/* If M < N, then at KK = M-IOFFSET, I = M and we have a */ +/* one-row residual matrix in A and the elementary */ +/* reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update */ +/* is needed for the residual matrix in A and the */ +/* right-hand-side-matrix in B. */ +/* Therefore, we update only if */ +/* KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */ +/* condition is satisfied, not only KK < N+NRHS ) */ + + if (kk < minmnupdt) { + aikk = a[i__ + kk * a_dim1]; + a[i__ + kk * a_dim1] = 1.f; + i__2 = *m - i__ + 1; + i__3 = *n + *nrhs - kk; + slarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &tau[ + kk], &a[i__ + (kk + 1) * a_dim1], lda, &work[1]); + a[i__ + kk * a_dim1] = aikk; + } + + if (kk < minmnfact) { + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */ +/* when KK < f2cmin(M-IOFFSET, N). */ + + i__2 = *n; + for (j = kk + 1; j <= i__2; ++j) { + if (vn1[j] != 0.f) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + +/* Computing 2nd power */ + r__2 = (r__1 = a[i__ + j * a_dim1], abs(r__1)) / vn1[j]; + temp = 1.f - r__2 * r__2; + temp = f2cmax(temp,0.f); +/* Computing 2nd power */ + r__1 = vn1[j] / vn2[j]; + temp2 = temp * (r__1 * r__1); + if (temp2 <= tol3z) { + +/* Compute the column 2-norm for the partial */ +/* column A(I+1:M,J) by explicitly computing it, */ +/* and store it in both partial 2-norm vector VN1 */ +/* and exact column 2-norm vector VN2. */ + + i__3 = *m - i__; + vn1[j] = snrm2_(&i__3, &a[i__ + 1 + j * a_dim1], & + c__1); + vn2[j] = vn1[j]; + + } else { + +/* Update the column 2-norm for the partial */ +/* column A(I+1:M,J) by removing one */ +/* element A(I,J) and store it in partial */ +/* 2-norm vector VN1. */ + + vn1[j] *= sqrt(temp); + + } + } + } + + } + +/* End factorization loop */ + + } + +/* If we reached this point, all colunms have been factorized, */ +/* i.e. no condition was triggered to exit the routine. */ +/* Set the number of factorized columns. */ + + *k = *kmax; + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */ +/* we return. 
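*/

+/* Editor's note, a worked restatement of the downdate above: removing
+   row I from column J gives, following LAPACK Working Note 176,
+
+       vn1(J)_new**2 = vn1(J)**2 - A(I,J)**2
+                     = vn1(J)**2 * ( 1 - ( |A(I,J)| / vn1(J) )**2 ),
+
+   i.e. vn1(J) := vn1(J) * sqrt(TEMP). The guard
+   TEMP * ( vn1(J) / vn2(J) )**2 <= TOL3Z = sqrt(eps) detects when
+   cancellation relative to vn2(J), the last exactly computed norm,
+   has made the running value untrustworthy; snrm2_ then recomputes
+   the norm of A(I+1:M,J) from scratch. */

+/*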
*/ + + if (*k < minmnfact) { + + i__1 = *n - *k; + jmaxc2nrm = *k + isamax_(&i__1, &vn1[*k + 1], &c__1); + *maxc2nrmk = vn1[jmaxc2nrm]; + + if (*k == 0) { + *relmaxc2nrmk = 1.f; + } else { + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + } + + } else { + *maxc2nrmk = 0.f; + *relmaxc2nrmk = 0.f; + } + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, set TAUs corresponding to the columns that were */ +/* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. */ + + i__1 = minmnfact; + for (j = *k + 1; j <= i__1; ++j) { + tau[j] = 0.f; + } + + return 0; + +/* End of SLAQP2RK */ + +} /* slaqp2rk_ */ + diff --git a/lapack-netlib/SRC/slaqp2rk.f b/lapack-netlib/SRC/slaqp2rk.f new file mode 100644 index 000000000..f88b0ce90 --- /dev/null +++ b/lapack-netlib/SRC/slaqp2rk.f @@ -0,0 +1,713 @@ +*> \brief \b SLAQP2RK computes truncated QR factorization with column pivoting of a real matrix block using Level 2 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SLAQP2RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, +* $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, +* $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, +* $ INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS +* REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER JPIV( * ) +* REAL A( LDA, * ), TAU( * ), VN1( * ), VN2( * ), +* $ WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLAQP2RK computes a truncated (rank K) or full rank Householder QR +*> factorization with column pivoting of a real matrix +*> block A(IOFFSET+1:M,1:N) as +*> +*> A * P(K) = Q(K) * R(K). +*> +*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N) +*> is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides matrix block B +*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**T * B. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M-IOFFSET,N), then this stopping +*> criterion is not used, factorize columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. 
+*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The second factorization stopping criterion. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The third factorization stopping criterion. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine SGEQP3RK. 1 <= KP1 <= N_orig_mat. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is DOUBLE PRECISION +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine SGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:K) below +*> the diagonal together with the array TAU represent +*> the orthogonal matrix Q(K) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:K) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS). +*> The left part A(IOFFSET+1:M,K+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(K)**T. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. 
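+*>
+*> (Editor's illustration, hedged:) after a rank-K stop, the part of
+*> the factorization computed so far has the partitioned form
+*>
+*>    A * P(K) = Q(K) * [ R11  R12 ]
+*>                      [  0   A22 ]
+*>
+*> with R11 the K-by-K upper triangular factor and A22 the residual
+*> block whose largest column 2-norm is returned in MAXC2NRMK.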
+*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is REAL array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is REAL array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is REAL array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (N-1) +*> Used in SLARF subroutine to apply an elementary +*> reflector from the left. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp2rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. 
+*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, + $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, + $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, + $ INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS + REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER JPIV( * ) + REAL A( LDA, * ), TAU( * ), VN1( * ), VN2( * ), + $ WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + INTEGER I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT, + $ MINMNUPDT + REAL AIKK, HUGEVAL, TEMP, TEMP2, TOL3Z +* .. +* .. External Subroutines .. + EXTERNAL SLARF, SLARFG, SSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL SISNAN + INTEGER ISAMAX + REAL SLAMCH, SNRM2 + EXTERNAL SISNAN, SLAMCH, ISAMAX, SNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* +* MINMNUPDT is the smallest dimension +* of the subarray A(IOFFSET+1:M,1:N+NRHS) to be udated, which +* contains the submatrices A(IOFFSET+1:M,1:N) and +* B(IOFFSET+1:M,1:NRHS) as column blocks. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + KMAX = MIN( KMAX, MINMNFACT ) + TOL3Z = SQRT( SLAMCH( 'Epsilon' ) ) + HUGEVAL = SLAMCH( 'Overflow' ) +* +* Compute the factorization, KK is the lomn loop index. +* + DO KK = 1, KMAX +* + I = IOFFSET + KK +* + IF( I.EQ.1 ) THEN +* +* ============================================================ +* +* We are at the first column of the original whole matrix A, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + + KP = KP1 +* +* ============================================================ +* + ELSE +* +* ============================================================ +* +* Determine the pivot column in KK-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( KK-1 ) + ISAMAX( N-KK+1, VN1( KK ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,KK:N) in step KK. +* RELMAXC2NRMK will be computed later, after somecondition +* checks on MAXC2NRMK. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains NaN, and set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. 
+* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( SISNAN( MAXC2NRMK ) ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + INFO = K + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* Array TAU(K+1:MINMNFACT) is not set and contains +* undefined elements. +* + RETURN + END IF +* +* ============================================================ +* +* Quick return, if the submatrix A(I:M,KK:N) is +* a zero matrix. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + RELMAXC2NRMK = ZERO +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + KK - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third stopping criteria. +* NOTE: There is no need to test for ABSTOL >= ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL >= ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. + + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* +* Set K, the number of factorized columns. +* + K = KK - 1 +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,KK:N): +* 1) swap the KK-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) copy the KK-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than KK in the next loop step.) +* 3) Save the pivot interchange with the indices relative to the +* the original matrix A, not the block A(1:M,1:N). 
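+*
+* (Editor's worked example, hedged): the main routine SGEQP3RK starts
+* JPIV at the identity permutation, e.g. JPIV = ( 1, 2, 3, 4 ); an
+* interchange of columns KK = 1 and KP = 3 then leaves
+* JPIV = ( 3, 2, 1, 4 ), so that afterwards column J of the pivoted
+* matrix is column JPIV( J ) of the original whole matrix A_orig.
+*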
+* + IF( KP.NE.KK ) THEN + CALL SSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 ) + VN1( KP ) = VN1( KK ) + VN2( KP ) = VN2( KK ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( KK ) + JPIV( KK ) = ITEMP + END IF +* +* Generate elementary reflector H(KK) using the column A(I:M,KK), +* if the column has more than one element, otherwise +* the elementary reflector would be an identity matrix, +* and TAU(KK) = ZERO. +* + IF( I.LT.M ) THEN + CALL SLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1, + $ TAU( KK ) ) + ELSE + TAU( KK ) = ZERO + END IF +* +* Check if TAU(KK) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(KK) for Inf, +* since SLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by SLARFG can contain Inf, which requires +* TAU(KK) to contain NaN. Therefore, this case of generating Inf +* by SLARFG is covered by checking TAU(KK) for NaN. +* + IF( SISNAN( TAU(KK) ) ) THEN + K = KK - 1 + INFO = KK +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAU( KK ) + RELMAXC2NRMK = TAU( KK ) +* +* Array TAU(KK:MINMNFACT) is not set and contains +* undefined elements, except the first element TAU(KK) = NaN. +* + RETURN + END IF +* +* Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left. +* ( If M >= N, then at KK = N there is no residual matrix, +* i.e. no columns of A to update, only columns of B. +* If M < N, then at KK = M-IOFFSET, I = M and we have a +* one-row residual matrix in A and the elementary +* reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update +* is needed for the residual matrix in A and the +* right-hand-side-matrix in B. +* Therefore, we update only if +* KK < MINMNUPDT = min(M-IOFFSET, N+NRHS) +* condition is satisfied, not only KK < N+NRHS ) +* + IF( KK.LT.MINMNUPDT ) THEN + AIKK = A( I, KK ) + A( I, KK ) = ONE + CALL SLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1, + $ TAU( KK ), A( I, KK+1 ), LDA, WORK( 1 ) ) + A( I, KK ) = AIKK + END IF +* + IF( KK.LT.MINMNFACT ) THEN +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. +* when KK < min(M-IOFFSET, N). +* + DO J = KK + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2 + TEMP = MAX( TEMP, ZERO ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2 .LE. TOL3Z ) THEN +* +* Compute the column 2-norm for the partial +* column A(I+1:M,J) by explicitly computing it, +* and store it in both partial 2-norm vector VN1 +* and exact column 2-norm vector VN2. +* + VN1( J ) = SNRM2( M-I, A( I+1, J ), 1 ) + VN2( J ) = VN1( J ) +* + ELSE +* +* Update the column 2-norm for the partial +* column A(I+1:M,J) by removing one +* element A(I,J) and store it in partial +* 2-norm vector VN1. +* + VN1( J ) = VN1( J )*SQRT( TEMP ) +* + END IF + END IF + END DO +* + END IF +* +* End factorization loop +* + END DO +* +* If we reached this point, all colunms have been factorized, +* i.e. no condition was triggered to exit the routine. +* Set the number of factorized columns. +* + K = KMAX +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before +* we return. 
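+*
+* (Editor's note:) in the K.LT.MINMNFACT branch below the residual
+* block A(I+1:M,K+1:N) is nonempty, so its largest column 2-norm is
+* read off the running partial norms with ISAMAX over VN1( K+1:N );
+* the K.EQ.0 convention RELMAXC2NRMK = ONE reflects that the residual
+* is then the whole of A_orig, giving the ratio MAXC2NRM / MAXC2NRM.
+*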
+* + IF( K.LT.MINMNFACT ) THEN +* + JMAXC2NRM = K + ISAMAX( N-K, VN1( K+1 ), 1 ) + MAXC2NRMK = VN1( JMAXC2NRM ) +* + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + END IF +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, set TAUs corresponding to the columns that were +* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. +* + DO J = K + 1, MINMNFACT + TAU( J ) = ZERO + END DO +* + RETURN +* +* End of SLAQP2RK +* + END diff --git a/lapack-netlib/SRC/slaqp3rk.c b/lapack-netlib/SRC/slaqp3rk.c new file mode 100644 index 000000000..e3632538b --- /dev/null +++ b/lapack-netlib/SRC/slaqp3rk.c @@ -0,0 +1,1109 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + 
ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + sgemm_("No transpose", "Transpose", &i__1, nrhs, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* Quick return, if the submatrix A(I:M,K:N) is */ +/* a zero matrix. We need to check it only if the column index */ +/* (same as row index) is larger than 1, since the condition */ +/* for the whole original matrix A_orig is checked in the main */ +/* routine. */ + + if (*maxc2nrmk == 0.f) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *relmaxc2nrmk = 0.f; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix is zero and we stop the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + sgemm_("No transpose", "Transpose", &i__1, nrhs, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + tau[j] = 0.f; + } + +/* Return from the routine. 
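*/

+/* Editor's note (hedged): each early exit in this routine leaves the
+   trailing matrix consistent by flushing the accumulated block
+   reflector as one rank-KB update,
+
+       A(IF+1:M, KB+1:N+NRHS) := A(IF+1:M, KB+1:N+NRHS)
+                               - A(IF+1:M, 1:KB) * F(KB+1:N+NRHS, 1:KB)**T
+
+   (or only the right-hand-side columns N+1:N+NRHS when the residual
+   of A itself is abandoned). That is what the sgemm_ calls above
+   compute, with alpha = -1 and beta = 1 in the underlying Fortran
+   SLAQP3RK. */

+/*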
*/ + + return 0; + + } + +/* ============================================================ */ + +/* Check if the submatrix A(I:M,K:N) contains Inf, */ +/* set INFO parameter to the column number, where */ +/* the first Inf is found plus N, and continue */ +/* the computation. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + if (*info == 0 && *maxc2nrmk > myhugeval) { + *info = *n + k - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third tolerance stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL.GE.ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig; */ + + *kb = k - 1; + if__ = i__ - 1; + +/* Apply the block reflector to the residual of the */ +/* matrix A and the residual of the right hand sides B, if */ +/* the residual matrix and and/or the residual of the right */ +/* hand sides exist, i.e. if the submatrix */ +/* A(I+1:M,KB+1:N+NRHS) exists. This occurs when */ +/* KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + sgemm_("No transpose", "Transpose", &i__1, &i__2, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*kb + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + tau[j] = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,K:N): */ +/* 1) swap the K-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */ +/* 3) copy the K-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. (Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than K in the next loop step.) */ +/* 4) Save the pivot interchange with the indices relative to the */ +/* the original matrix A_orig, not the block A(1:M,1:N). 
*/ + + if (kp != k) { + sswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + sswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf); + vn1[kp] = vn1[k]; + vn2[kp] = vn2[k]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[k]; + jpiv[k] = itemp; + } + +/* Apply previous Householder reflectors to column K: */ +/* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + sgemv_("No transpose", &i__1, &i__2, &c_b7, &a[i__ + a_dim1], lda, + &f[k + f_dim1], ldf, &c_b8, &a[i__ + k * a_dim1], &c__1); + } + +/* Generate elementary reflector H(k) using the column A(I:M,K). */ + + if (i__ < *m) { + i__1 = *m - i__ + 1; + slarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], & + c__1, &tau[k]); + } else { + tau[k] = 0.f; + } + +/* Check if TAU(K) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(K) for Inf, */ +/* since SLARFG cannot produce TAU(K) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by SLARFG can contain Inf, which requires */ +/* TAU(K) to contain NaN. Therefore, this case of generating Inf */ +/* by SLARFG is covered by checking TAU(K) for NaN. */ + + if (sisnan_(&tau[k])) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *info = k; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = tau[k]; + *relmaxc2nrmk = tau[k]; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix contains NaN and we stop */ +/* the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + sgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &c_b7, & + a[if__ + 1 + a_dim1], lda, &f[*n + 1 + f_dim1], ldf, & + c_b8, &a[if__ + 1 + (*n + 1) * a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* =============================================================== */ + + aik = a[i__ + k * a_dim1]; + a[i__ + k * a_dim1] = 1.f; + +/* =============================================================== */ + +/* Compute the current K-th column of F: */ +/* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K). */ + + if (k < *n + *nrhs) { + i__1 = *m - i__ + 1; + i__2 = *n + *nrhs - k; + sgemv_("Transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + 1) * + a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b30, &f[k + + 1 + k * f_dim1], &c__1); + } + +/* 2) Zero out elements above and on the diagonal of the */ +/* column K in matrix F, i.e elements F(1:K,K). 
*/
+
+	i__1 = k;
+	for (j = 1; j <= i__1; ++j) {
+	    f[j + k * f_dim1] = 0.f;
+	}
+
+/*     3) Incremental updating of the K-th column of F: */
+/*        F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T */
+/*                    * A(I:M,K). */
+
+	if (k > 1) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = k - 1;
+	    r__1 = -tau[k];
+	    sgemv_("Transpose", &i__1, &i__2, &r__1, &a[i__ + a_dim1], lda, &
+		    a[i__ + k * a_dim1], &c__1, &c_b30, &auxv[1], &c__1);
+
+	    i__1 = *n + *nrhs;
+	    i__2 = k - 1;
+	    sgemv_("No transpose", &i__1, &i__2, &c_b8, &f[f_dim1 + 1], ldf, &
+		    auxv[1], &c__1, &c_b8, &f[k * f_dim1 + 1], &c__1);
+	}
+
+/* =============================================================== */
+
+/*     Update the current I-th row of A: */
+/*     A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */
+/*                      - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. */
+
+	if (k < *n + *nrhs) {
+	    i__1 = *n + *nrhs - k;
+	    sgemv_("No transpose", &i__1, &k, &c_b7, &f[k + 1 + f_dim1], ldf,
+		    &a[i__ + a_dim1], lda, &c_b8, &a[i__ + (k + 1) * a_dim1],
+		    lda);
+	}
+
+	a[i__ + k * a_dim1] = aik;
+
+/*     Update the partial column 2-norms for the residual matrix, */
+/*     only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */
+/*     when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */
+
+	if (k < minmnfact) {
+
+	    i__1 = *n;
+	    for (j = k + 1; j <= i__1; ++j) {
+		if (vn1[j] != 0.f) {
+
+/*                 NOTE: The following lines follow from the analysis in */
+/*                 Lapack Working Note 176. */
+
+		    temp = (r__1 = a[i__ + j * a_dim1], abs(r__1)) / vn1[j];
+/* Computing MAX */
+		    r__1 = 0.f, r__2 = (temp + 1.f) * (1.f - temp);
+		    temp = f2cmax(r__1,r__2);
+/* Computing 2nd power */
+		    r__1 = vn1[j] / vn2[j];
+		    temp2 = temp * (r__1 * r__1);
+		    if (temp2 <= tol3z) {
+
+/*                     At J-index, we have a difficult column for the */
+/*                     update of the 2-norm. Save the index of the previous */
+/*                     difficult column in IWORK(J-1). */
+/*                     NOTE: ILSTCC > 1, therefore we can use IWORK only */
+/*                     with N-1 elements, where the elements are */
+/*                     shifted by 1 to the left. */
+
+			iwork[j - 1] = lsticc;
+
+/*                     Set the index of the last difficult column LSTICC. */
+
+			lsticc = j;
+
+		    } else {
+			vn1[j] *= sqrt(temp);
+		    }
+		}
+	    }
+
+	}
+
+/*     End of while loop. */
+
+    }
+
+/*     Now, after the loop: */
+/*     Set KB, the number of factorized columns in the block; */
+/*     Set IF, the number of processed rows in the block, which */
+/*     is the same as the number of processed rows in */
+/*     the original whole matrix A_orig, IF = IOFFSET + KB. */
+
+    *kb = k;
+    if__ = i__;
+
+/*     Apply the block reflector to the residual of the matrix A */
+/*     and the residual of the right hand sides B, if the residual */
+/*     matrix and/or the residual of the right hand sides */
+/*     exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */
+/*     This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */
+
+/*     A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */
+/*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */
+
+    if (*kb < minmnupdt) {
+	i__1 = *m - if__;
+	i__2 = *n + *nrhs - *kb;
+	sgemm_("No transpose", "Transpose", &i__1, &i__2, kb, &c_b7, &a[if__
+		+ 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b8, &a[if__
+		+ 1 + (*kb + 1) * a_dim1], lda);
+    }
+
+/*     Recompute the 2-norm of the difficult columns. */
+/*     Loop over the index of the difficult columns from the largest */
+/*     to the smallest index. */
+
+    while(lsticc > 0) {
+
+/*        LSTICC, the index of the last difficult column, is greater */
+/*        than 1. */
+/*        ITEMP is the index of the previous difficult column.
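The downdating test in the loop above comes from LAPACK Working Note 176, cited in the comment: VN1(J) is cheap to downdate after each Householder step, but when cancellation would make the downdated value unreliable the column is flagged "difficult" and its norm is recomputed later with SNRM2. A self-contained C sketch of just that test (illustrative names, not the routine's actual interface):

    #include <math.h>

    /* Returns 1 if the downdated norm would be untrustworthy (caller must
       recompute it with snrm2), else downdates *vn1 in place and returns 0.
       a_ij is the entry eliminated this step; vn2 is the last exact norm;
       tol3z = sqrt(machine epsilon), as in the routine. */
    static int downdate_colnorm(float a_ij, float *vn1, float vn2, float tol3z) {
        float temp = fabsf(a_ij) / *vn1;
        temp = fmaxf(0.0f, (1.0f + temp) * (1.0f - temp)); /* 1 - temp^2, clamped at 0 */
        float ratio = *vn1 / vn2;
        if (temp * ratio * ratio <= tol3z)
            return 1;               /* too much cancellation: recompute later */
        *vn1 *= sqrtf(temp);        /* safe partial update */
        return 0;
    }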
*/
+
+	itemp = iwork[lsticc - 1];
+
+/*        Compute the 2-norm explicitly for the last difficult column and */
+/*        save it in the partial and exact 2-norm vectors VN1 and VN2. */
+
+/*        NOTE: The computation of VN1( LSTICC ) relies on the fact that */
+/*        SNRM2 does not fail on vectors with norm below the value of */
+/*        SQRT(SLAMCH('S')) */
+
+	i__1 = *m - if__;
+	vn1[lsticc] = snrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1);
+	vn2[lsticc] = vn1[lsticc];
+
+/*        Downdate the index of the last difficult column to */
+/*        the index of the previous difficult column. */
+
+	lsticc = itemp;
+
+    }
+
+    return 0;
+
+/*     End of SLAQP3RK */
+
+} /* slaqp3rk_ */
+
diff --git a/lapack-netlib/SRC/slaqp3rk.f b/lapack-netlib/SRC/slaqp3rk.f
new file mode 100644
index 000000000..b2dc2b334
--- /dev/null
+++ b/lapack-netlib/SRC/slaqp3rk.f
@@ -0,0 +1,935 @@
+*> \brief \b SLAQP3RK computes a step of truncated QR factorization with column pivoting of a real m-by-n matrix A using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SLAQP3RK + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+*      $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+*      $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*      $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+*       IMPLICIT NONE
+*       LOGICAL            DONE
+*       INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+*      $                   NB, NRHS
+*       REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*      $                   RELTOL
+*
+*      .. Scalar Arguments ..
+*       LOGICAL            DONE
+*       INTEGER            KB, LDA, LDF, M, N, NB, NRHS, IOFFSET
+*       REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*      $                   RELTOL
+*      ..
+*      .. Array Arguments ..
+*       INTEGER            IWORK( * ), JPIV( * )
+*       REAL               A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ),
+*      $                   VN1( * ), VN2( * )
+*      ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SLAQP3RK computes a step of truncated QR factorization with column
+*> pivoting of a real M-by-N matrix A block A(IOFFSET+1:M,1:N)
+*> by using Level 3 BLAS as
+*>
+*>                A * P(KB) = Q(KB) * R(KB).
+*>
+*> The routine tries to factorize NB columns from A starting from
+*> the row IOFFSET+1 and updates the residual matrix with BLAS 3
+*> xGEMM. The number of actually factorized columns is returned
+*> in KB and may be smaller than NB.
+*>
+*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides matrix B stored
+*> in A(IOFFSET+1:M,N+1:N+NRHS) with Q(KB)**T * B.
+*>
+*> Cases when the number of factorized columns KB < NB:
+*>
+*> (1) In some cases, due to catastrophic cancellations, it cannot
+*> factorize all NB columns and needs to update the residual matrix.
+*> Hence, the actual number of factorized columns in the block returned
+*> in KB is smaller than NB. The logical DONE is returned as FALSE.
+*> The factorization of the whole original matrix A_orig must proceed
+*> with the next block.
+*>
+*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB.
+*>
+*> (3) In case both stopping criteria ABSTOL and RELTOL are not used,
+*> and when the residual matrix is a zero matrix in some factorization
+*> step KB, the factorization of the whole original matrix A_orig is
+*> stopped, the logical DONE is returned as TRUE. The number of
+*> factorized columns which is smaller than NB is returned in KB.
+*>
+*> (4) Whenever NaN is detected in the matrix A or in the array TAU,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB. The INFO
+*> parameter is set to the column index of the first NaN occurrence.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          Factorization block size, i.e. the number of columns
+*>          to factorize in the matrix A. 0 <= NB.
+*>
+*>          If NB = 0, then the routine exits immediately.
+*>             This means that the factorization is not performed,
+*>             the matrices A and B and the arrays TAU, JPIV
+*>             are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is REAL, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          b) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine SGEQP3RK. 1 <= KP1 <= N_orig.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is REAL
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine SGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
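As a usage aside (this is not part of the routine's interface, which receives the tolerances from the SGEQP3RK driver named above): a caller determining numerical rank would typically disable the absolute criterion and tie the relative one to working precision. A hedged C sketch of one common choice; the eps*max(m,n) rule is a heuristic, not something this documentation prescribes:

    #include <float.h>

    /* Illustrative tolerance choice for truncated QR with column pivoting. */
    static void choose_tolerances(int m, int n, float *abstol, float *reltol) {
        *abstol = -1.0f;             /* ABSTOL < 0.0 switches that test off */
        *reltol = FLT_EPSILON * (float)(m > n ? m : n);  /* rank-revealing cutoff */
    }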
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:KB) below
+*>             the diagonal together with the array TAU represent
+*>             the orthogonal matrix Q(KB) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:KB) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,KB+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(KB)**T.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] DONE
+*> \verbatim
+*>          DONE is LOGICAL
+*>          TRUE: a) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to ABSTOL
+*>                   or RELTOL criterion,
+*>                b) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to the
+*>                   residual matrix being a ZERO matrix.
+*>                c) when NaN was detected in the matrix A
+*>                   or in the array TAU.
+*>          FALSE: otherwise.
+*> \endverbatim
+*>
+*> \param[out] KB
+*> \verbatim
+*>          KB is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= KB <= min(M-IOFFSET,NB,N).
+*>
+*>          KB also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is REAL
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank KB. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is REAL
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank KB) to the maximum column 2-norm of the
+*>          original matrix A_orig. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is REAL array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is REAL array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is REAL array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] AUXV
+*> \verbatim
+*>          AUXV is REAL array, dimension (NB)
+*>          Auxiliary vector.
+*> \endverbatim
+*>
+*> \param[out] F
+*> \verbatim
+*>          F is REAL array, dimension (LDF,NB)
+*>          Matrix F**T = L*(Y**T)*A.
+*> \endverbatim
+*>
+*> \param[in] LDF
+*> \verbatim
+*>          LDF is INTEGER
+*>          The leading dimension of the array F. LDF >= max(1,N+NRHS).
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          A work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step KB+1 ( when KB columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             KB                  is set to the number of
+*>                                    factorized columns without
+*>                                    exception.
+*>             MAXC2NRMK           is set to NaN.
+*>             RELMAXC2NRMK        is set to NaN.
+*>             TAU(KB+1:min(M,N))  is not set and contains undefined
+*>                                    elements. If j_1=KB+1, TAU(KB+1)
+*>                                    may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step KB+1 ( when KB columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp3rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> https://doi.org/10.1137/S1064827595296732
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> https://doi.org/10.1145/1377612.1377616
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE SLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      LOGICAL            DONE
+      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+     $                   NB, NRHS
+      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      REAL               A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ),
+     $                   VN1( * ), VN2( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            ITEMP, J, K, MINMNFACT, MINMNUPDT,
+     $                   LSTICC, KP, I, IF
+      REAL               AIK, HUGEVAL, TEMP, TEMP2, TOL3Z
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SGEMM, SGEMV, SLARFG, SSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            SISNAN
+      INTEGER            ISAMAX
+      REAL               SLAMCH, SNRM2
+      EXTERNAL           SISNAN, SLAMCH, ISAMAX, SNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT is the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      NB = MIN( NB, MINMNFACT )
+      TOL3Z = SQRT( SLAMCH( 'Epsilon' ) )
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+*     Compute factorization in a while loop over NB columns,
+*     K is the column index in the block A(1:M,1:N).
+*
+      K = 0
+      LSTICC = 0
+      DONE = .FALSE.
+*
+      DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 )
+         K = K + 1
+         I = IOFFSET + K
+*
+         IF( I.EQ.1 ) THEN
+*
+*           We are at the first column of the original whole matrix A_orig,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+            KP = KP1
+*
+         ELSE
+*
+*           Determine the pivot column in K-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,K:N).
+*
+            KP = ( K-1 ) + ISAMAX( N-K+1, VN1( K ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,K:N) in step K.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains NaN, set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( SISNAN( MAXC2NRMK ) ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*              that are non-zero in each step in the block,
+*              i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*              is the same as the number of processed rows in
+*              the original whole matrix A_orig.
+*
+               KB = K - 1
+               IF = I - 1
+               INFO = KB + KP
+*
+*              Set RELMAXC2NRMK to NaN.
+*
+               RELMAXC2NRMK = MAXC2NRMK
+*
+*              There is no need to apply the block reflector to the
+*              residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*              since the submatrix contains NaN and we stop
+*              the computation.
+*              But, we need to apply the block reflector to the residual
+*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*              residual right hand sides exist. This occurs
+*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ):
+*
+*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T.
+
+               IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+                  CALL SGEMM( 'No transpose', 'Transpose',
+     $                 M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA,
+     $                 F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Array TAU(KF+1:MINMNFACT) is not set and contains
+*              undefined elements.
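The NaN branch above, together with the Inf branch further down, implements the INFO convention documented earlier (a NaN column index in 1..N stops the factorization; an Inf is reported as N+1..2N while the computation continues). A hedged sketch of how a caller might decode it (illustrative helper, not part of LAPACK):

    /* Classify INFO from the xLAQP3RK/xGEQP3RK family per the
       convention documented above. */
    static const char *classify_info(int info, int n) {
        if (info == 0)                 return "success";
        if (info >= 1 && info <= n)    return "NaN detected; factorization stopped early";
        if (info > n && info <= 2 * n) return "Inf detected; factorization ran to completion";
        return "unexpected INFO value";
    }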
+* +* Return from the routine. +* + RETURN + END IF +* +* Quick return, if the submatrix A(I:M,K:N) is +* a zero matrix. We need to check it only if the column index +* (same as row index) is larger than 1, since the condition +* for the whole original matrix A_orig is checked in the main +* routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + RELMAXC2NRMK = ZERO +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix is zero and we stop the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL SGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. +* + DO J = K, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + K - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third tolerance stopping criteria. +* NOTE: There is no need to test for ABSTOL.GE.ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig; +* + KB = K - 1 + IF = I - 1 +* +* Apply the block reflector to the residual of the +* matrix A and the residual of the right hand sides B, if +* the residual matrix and and/or the residual of the right +* hand sides exist, i.e. if the submatrix +* A(I+1:M,KB+1:N+NRHS) exists. 
This occurs when +* KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL SGEMM( 'No transpose', 'Transpose', + $ M-IF, N+NRHS-KB, KB,-ONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. +* + DO J = K, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,K:N): +* 1) swap the K-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) +* 3) copy the K-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. (Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than K in the next loop step.) +* 4) Save the pivot interchange with the indices relative to the +* the original matrix A_orig, not the block A(1:M,1:N). +* + IF( KP.NE.K ) THEN + CALL SSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 ) + CALL SSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF ) + VN1( KP ) = VN1( K ) + VN2( KP ) = VN2( K ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( K ) + JPIV( K ) = ITEMP + END IF +* +* Apply previous Householder reflectors to column K: +* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. +* + IF( K.GT.1 ) THEN + CALL SGEMV( 'No transpose', M-I+1, K-1, -ONE, A( I, 1 ), + $ LDA, F( K, 1 ), LDF, ONE, A( I, K ), 1 ) + END IF +* +* Generate elementary reflector H(k) using the column A(I:M,K). +* + IF( I.LT.M ) THEN + CALL SLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) ) + ELSE + TAU( K ) = ZERO + END IF +* +* Check if TAU(K) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(K) for Inf, +* since SLARFG cannot produce TAU(K) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by SLARFG can contain Inf, which requires +* TAU(K) to contain NaN. Therefore, this case of generating Inf +* by SLARFG is covered by checking TAU(K) for NaN. +* + IF( SISNAN( TAU(K) ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = K +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAU( K ) + RELMAXC2NRMK = TAU( K ) +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. 
This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL SGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* =============================================================== +* + AIK = A( I, K ) + A( I, K ) = ONE +* +* =============================================================== +* +* Compute the current K-th column of F: +* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K). +* + IF( K.LT.N+NRHS ) THEN + CALL SGEMV( 'Transpose', M-I+1, N+NRHS-K, + $ TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1, + $ ZERO, F( K+1, K ), 1 ) + END IF +* +* 2) Zero out elements above and on the diagonal of the +* column K in matrix F, i.e elements F(1:K,K). +* + DO J = 1, K + F( J, K ) = ZERO + END DO +* +* 3) Incremental updating of the K-th column of F: +* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T +* * A(I:M,K). +* + IF( K.GT.1 ) THEN + CALL SGEMV( 'Transpose', M-I+1, K-1, -TAU( K ), + $ A( I, 1 ), LDA, A( I, K ), 1, ZERO, + $ AUXV( 1 ), 1 ) +* + CALL SGEMV( 'No transpose', N+NRHS, K-1, ONE, + $ F( 1, 1 ), LDF, AUXV( 1 ), 1, ONE, + $ F( 1, K ), 1 ) + END IF +* +* =============================================================== +* +* Update the current I-th row of A: +* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) +* - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. +* + IF( K.LT.N+NRHS ) THEN + CALL SGEMV( 'No transpose', N+NRHS-K, K, -ONE, + $ F( K+1, 1 ), LDF, A( I, 1 ), LDA, ONE, + $ A( I, K+1 ), LDA ) + END IF +* + A( I, K ) = AIK +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. +* when K < MINMNFACT = min( M-IOFFSET, N ). +* + IF( K.LT.MINMNFACT ) THEN +* + DO J = K + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ABS( A( I, J ) ) / VN1( J ) + TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2.LE.TOL3Z ) THEN +* +* At J-index, we have a difficult column for the +* update of the 2-norm. Save the index of the previous +* difficult column in IWORK(J-1). +* NOTE: ILSTCC > 1, threfore we can use IWORK only +* with N-1 elements, where the elements are +* shifted by 1 to the left. +* + IWORK( J-1 ) = LSTICC +* +* Set the index of the last difficult column LSTICC. +* + LSTICC = J +* + ELSE + VN1( J ) = VN1( J )*SQRT( TEMP ) + END IF + END IF + END DO +* + END IF +* +* End of while loop. +* + END DO +* +* Now, afler the loop: +* Set KB, the number of factorized columns in the block; +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig, IF = IOFFSET + KB. +* + KB = K + IF = I +* +* Apply the block reflector to the residual of the matrix A +* and the residual of the right hand sides B, if the residual +* matrix and and/or the residual of the right hand sides +* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. 
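(The interrupted comment resumes below.) The deferred update it describes is a single rank-KB GEMM applied to the residual columns and the appended right-hand sides at once. A hedged CBLAS sketch of the same operation in C, assuming OpenBLAS's cblas.h and 0-based column-major storage (names illustrative):

    #include <cblas.h>

    /* A(IF+1:M, KB+1:N+NRHS) -= A(IF+1:M, 1:KB) * F(KB+1:N+NRHS, 1:KB)**T,
       with IF = ioff + kb, as in the trailing update of this routine. */
    static void apply_block_reflector(int m, int n, int nrhs, int kb, int ioff,
                                      float *a, int lda, float *f, int ldf) {
        int rows = m - (ioff + kb);      /* rows of the residual block */
        int cols = n + nrhs - kb;        /* residual + RHS columns     */
        if (rows <= 0 || cols <= 0 || kb <= 0) return;
        cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans,
                    rows, cols, kb, -1.0f,
                    &a[(ioff + kb)], lda,        /* A(IF+1:M, 1:KB)      */
                    &f[kb], ldf,                 /* F(KB+1:N+NRHS, 1:KB) */
                    1.0f, &a[(ioff + kb) + (size_t)kb * lda], lda);
    }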
+* This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL SGEMM( 'No transpose', 'Transpose', + $ M-IF, N+NRHS-KB, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* Recompute the 2-norm of the difficult columns. +* Loop over the index of the difficult columns from the largest +* to the smallest index. +* + DO WHILE( LSTICC.GT.0 ) +* +* LSTICC is the index of the last difficult column is greater +* than 1. +* ITEMP is the index of the previous difficult column. +* + ITEMP = IWORK( LSTICC-1 ) +* +* Compute the 2-norm explicilty for the last difficult column and +* save it in the partial and exact 2-norm vectors VN1 and VN2. +* +* NOTE: The computation of VN1( LSTICC ) relies on the fact that +* SNRM2 does not fail on vectors with norm below the value of +* SQRT(SLAMCH('S')) +* + VN1( LSTICC ) = SNRM2( M-IF, A( IF+1, LSTICC ), 1 ) + VN2( LSTICC ) = VN1( LSTICC ) +* +* Downdate the index of the last difficult column to +* the index of the previous difficult column. +* + LSTICC = ITEMP +* + END DO +* + RETURN +* +* End of SLAQP3RK +* + END diff --git a/lapack-netlib/SRC/slaswlq.f b/lapack-netlib/SRC/slaswlq.f index 685f823a0..594c646db 100644 --- a/lapack-netlib/SRC/slaswlq.f +++ b/lapack-netlib/SRC/slaswlq.f @@ -96,22 +96,24 @@ *> The leading dimension of the array T. LDT >= MB. *> \endverbatim *> -*> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) -*> +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= MB * M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim + *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -163,32 +165,35 @@ *> * ===================================================================== SUBROUTINE SLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), T( LDT, *) + REAL A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL SGELQT, SGEQRT, STPLQT, STPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. 
@@ -199,12 +204,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LE.0 ) THEN INFO = -4 @@ -212,60 +224,60 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = MB*M + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN - CALL SGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) + IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN + CALL SGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO ) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL SGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL SGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M, (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL STPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), - $ LDA, T(1, CTR * M + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL STPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + $ LDA, T(1, CTR * M + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL STPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), - $ LDA, T(1, CTR * M + 1), LDT, - $ WORK, INFO ) - END IF + $ LDA, T(1, CTR * M + 1), LDT, + $ WORK, INFO ) + END IF * - WORK( 1 ) = SROUNDUP_LWORK(M * MB) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLASWLQ diff --git a/lapack-netlib/SRC/slatrs3.f b/lapack-netlib/SRC/slatrs3.f index 8f0c4bf16..17052289e 100644 --- a/lapack-netlib/SRC/slatrs3.f +++ b/lapack-netlib/SRC/slatrs3.f @@ -151,13 +151,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK). -*> On exit, if INFO = 0, WORK(1) returns the optimal size of -*> WORK. +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -165,6 +168,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -181,7 +185,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -253,7 +257,7 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. @@ -264,7 +268,8 @@ EXTERNAL ILAENV, LSAME, SLAMCH, SLANGE, SLARMM * .. * .. External Subroutines .. - EXTERNAL SLATRS, SSCAL, XERBLA + REAL SROUNDUP_LWORK + EXTERNAL SLATRS, SSCAL, SROUNDUP_LWORK, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN @@ -292,15 +297,24 @@ * row. WORK( I + KK * LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * * Test the input parameters. * @@ -322,7 +336,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN @@ -650,6 +664,8 @@ END DO END DO RETURN +* + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * * End of SLATRS3 * diff --git a/lapack-netlib/SRC/slatsqr.f b/lapack-netlib/SRC/slatsqr.f index 86733bb15..4730815b5 100644 --- a/lapack-netlib/SRC/slatsqr.f +++ b/lapack-netlib/SRC/slatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -161,33 +164,39 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE SLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), T(LDT, *) + REAL A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. 
- LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL SGEQRT, STPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -198,6 +207,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -205,64 +221,65 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = NB*N +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL SGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL SGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL SGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CALL SGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) * - CTR = 1 - DO I = MB+1, II-MB+N , (MB-N) + CTR = 1 + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL STPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, - $ T(1, CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL STPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + $ T(1, CTR * N + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL STPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, - $ T(1, CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + IF( II.LE.M ) THEN + CALL STPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + $ T(1, CTR * N + 1), LDT, + $ WORK, INFO ) + END IF * - work( 1 ) = N*NB + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLATSQR diff --git a/lapack-netlib/SRC/ssyevd.f b/lapack-netlib/SRC/ssyevd.f index a5e4638d6..2ae44fc81 100644 --- a/lapack-netlib/SRC/ssyevd.f +++ b/lapack-netlib/SRC/ssyevd.f @@ -96,8 +96,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, -*> dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -251,7 +250,7 @@ $ N*ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) ) LIOPT = LIWMIN END IF - WORK( 1 ) = SROUNDUP_LWORK(LOPT) + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) IWORK( 1 ) = LIOPT * IF( LWORK.LT.LWMIN .AND. 
.NOT.LQUERY ) THEN @@ -335,7 +334,7 @@ IF( ISCALE.EQ.1 ) $ CALL SSCAL( N, ONE / SIGMA, W, 1 ) * - WORK( 1 ) = SROUNDUP_LWORK(LOPT) + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) IWORK( 1 ) = LIOPT * RETURN diff --git a/lapack-netlib/SRC/ssyevr.f b/lapack-netlib/SRC/ssyevr.f index 47e4d7cbf..870facd60 100644 --- a/lapack-netlib/SRC/ssyevr.f +++ b/lapack-netlib/SRC/ssyevr.f @@ -271,7 +271,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,26*N). +*> The dimension of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 26*N. *> For optimal efficiency, LWORK >= (NB+6)*N, *> where NB is the max of the blocksize for SSYTRD and SORMTR *> returned by ILAENV. @@ -292,7 +293,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK and @@ -392,8 +394,13 @@ * LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) ) * - LWMIN = MAX( 1, 26*N ) - LIWMIN = MAX( 1, 10*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 26*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -428,7 +435,7 @@ NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -677,7 +684,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/ssyevr_2stage.f b/lapack-netlib/SRC/ssyevr_2stage.f index a2d6a6231..471e25977 100644 --- a/lapack-netlib/SRC/ssyevr_2stage.f +++ b/lapack-netlib/SRC/ssyevr_2stage.f @@ -278,6 +278,7 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 5*N @@ -300,13 +301,14 @@ *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) -*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. *> \endverbatim *> *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. 
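All of the LWORK/LIWORK changes in these hunks preserve LAPACK's two-call workspace-query protocol: pass LWORK = LIWORK = -1, read the required sizes back from WORK(1)/IWORK(1), allocate, and call again. A hedged caller-side sketch in C against SSYEVD (whose hunk appears above); the hand-written prototype, the trailing-underscore name mangling, and the omission of hidden character-length arguments are platform assumptions:

    #include <stdlib.h>

    extern void ssyevd_(const char *jobz, const char *uplo, const int *n,
                        float *a, const int *lda, float *w,
                        float *work, const int *lwork,
                        int *iwork, const int *liwork, int *info);

    static int eig_with_query(int n, float *a, int lda, float *w) {
        int info, lwork = -1, liwork = -1, iwq;
        float wq;
        /* Query call: only the workspace sizes are computed. */
        ssyevd_("V", "U", &n, a, &lda, w, &wq, &lwork, &iwq, &liwork, &info);
        if (info != 0) return info;
        lwork = (int)wq;
        liwork = iwq;
        float *work = malloc((size_t)lwork * sizeof *work);
        int *iwork = malloc((size_t)liwork * sizeof *iwork);
        if (!work || !iwork) { free(work); free(iwork); return -1; }
        /* Real call with properly sized workspaces. */
        ssyevd_("V", "U", &n, a, &lda, w, work, &lwork, iwork, &liwork, &info);
        free(work); free(iwork);
        return info;
    }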
*> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the IWORK array, @@ -445,8 +447,14 @@ IB = ILAENV2STAGE( 2, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -485,7 +493,7 @@ * NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) * NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) * LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) IWORK( 1 ) = LIWMIN END IF * @@ -505,7 +513,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 26 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = A( 1, 1 ) @@ -733,7 +741,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/ssyevx.f b/lapack-netlib/SRC/ssyevx.f index 2204aa39b..aaed6dad5 100644 --- a/lapack-netlib/SRC/ssyevx.f +++ b/lapack-netlib/SRC/ssyevx.f @@ -338,14 +338,14 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWKMIN = 1 - WORK( 1 ) = SROUNDUP_LWORK(LWKMIN) + LWKOPT = 1 ELSE LWKMIN = 8*N NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( LWKMIN, ( NB + 3 )*N ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -17 @@ -542,7 +542,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssysv_aa.f b/lapack-netlib/SRC/ssysv_aa.f index e43d4de7f..711a275e1 100644 --- a/lapack-netlib/SRC/ssysv_aa.f +++ b/lapack-netlib/SRC/ssysv_aa.f @@ -177,12 +177,13 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS + INTEGER LWKMIN, LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, SSYTRS_AA, SSYTRF_AA @@ -196,6 +197,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -206,18 +208,18 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_SYTRF = INT( WORK(1) ) + LWKOPT_SYTRF = INT( WORK( 1 ) ) CALL SSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_SYTRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT_SYTRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_SYTRF, LWKOPT_SYTRS ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -239,7 +241,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.f b/lapack-netlib/SRC/ssysv_aa_2stage.f index 3d88e068e..fb068b3bf 100644 --- a/lapack-netlib/SRC/ssysv_aa_2stage.f +++ b/lapack-netlib/SRC/ssysv_aa_2stage.f @@ -100,14 +100,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is REAL array, dimension (LTB) +*> TB is REAL array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -147,14 +147,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL workspace of size LWORK +*> WORK is REAL workspace of size (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -204,12 +205,13 @@ * .. * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKMIN, LWKOPT * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSYTRF_AA_2STAGE, SSYTRS_AA_2STAGE, @@ -226,6 +228,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -234,18 +237,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -255,7 +259,6 @@ RETURN END IF * -* * Compute the factorization A = U**T*T*U or A = L*T*L**T. * CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, @@ -269,7 +272,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssysvx.f b/lapack-netlib/SRC/ssysvx.f index 0d72217eb..06a6413f1 100644 --- a/lapack-netlib/SRC/ssysvx.f +++ b/lapack-netlib/SRC/ssysvx.f @@ -305,7 +305,7 @@ * .. 
* .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKMIN, LWKOPT, NB REAL ANORM * .. * .. External Functions .. @@ -327,6 +327,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 3*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -344,12 +345,12 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 3*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 3*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) diff --git a/lapack-netlib/SRC/ssytrd_2stage.f b/lapack-netlib/SRC/ssytrd_2stage.f index 5d70ae0d4..5b401c3d0 100644 --- a/lapack-netlib/SRC/ssytrd_2stage.f +++ b/lapack-netlib/SRC/ssytrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download SSYTRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download SSYTRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * REAL A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. -* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. *> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the orthogonal *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is REAL array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is REAL array, dimension (LHOUS2) +*> HOUS2 is REAL array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. 
+*> *> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -149,17 +151,19 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +181,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +206,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +214,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. 
-*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -265,10 +269,13 @@ * KD = ILAENV2STAGE( 1, 'SSYTRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'SSYTRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'SSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -309,14 +316,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL SSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL SSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRD_SY2SB', -INFO ) RETURN END IF - CALL SSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL SSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRD_SB2ST', -INFO ) @@ -324,8 +331,7 @@ END IF * * - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN RETURN * * End of SSYTRD_2STAGE diff --git a/lapack-netlib/SRC/ssytrd_sb2st.F b/lapack-netlib/SRC/ssytrd_sb2st.F index 32bae26dc..111eaa93e 100644 --- a/lapack-netlib/SRC/ssytrd_sb2st.F +++ b/lapack-netlib/SRC/ssytrd_sb2st.F @@ -132,15 +132,17 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is REAL array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is REAL array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension) +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error @@ -152,14 +154,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension LWORK. +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. 
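Several hunks in this patch route every size stored into the REAL array WORK through SROUNDUP_LWORK. The motivation: integers above 2**24 are not exactly representable in IEEE single precision, so a plain REAL(LWKOPT) can round down, and a caller's INT(WORK(1)) would then under-allocate. The demo below re-implements the gist of that helper under a hypothetical name, MY_ROUNDUP (a sketch of the idea, not the verbatim LAPACK source):

    program roundup_demo
       implicit none
       integer :: k
       k = 2**24 + 1                           ! not representable in single
       print *, int( real( k ) ) < k           ! T: plain conversion loses 1
       print *, int( my_roundup( k ) ) >= k    ! T: rounded-up value is safe
    contains
       real function my_roundup( lwork )
          integer, intent(in) :: lwork
          my_roundup = real( lwork )
          ! If REAL() rounded down, bump to the next representable value.
          if ( int( my_roundup ) < lwork ) &
             my_roundup = my_roundup * ( 1.0 + epsilon( 0.0 ) )
       end function my_roundup
    end program roundup_demo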
+*> IF N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension) +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -261,7 +266,7 @@ INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, + $ NBTILES, TTYPE, TID, NTHREADS, $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SISEV, SIZETAU, LDV, LHMIN, LWMIN @@ -283,7 +288,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -292,9 +296,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -315,8 +324,8 @@ END IF * IF( INFO.EQ.0 ) THEN - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + HOUS( 1 ) = SROUNDUP_LWORK( LHMIN ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -544,8 +553,7 @@ 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SSYTRD_SB2ST diff --git a/lapack-netlib/SRC/ssytrd_sy2sb.f b/lapack-netlib/SRC/ssytrd_sy2sb.f index 4efc43630..3996e07bb 100644 --- a/lapack-netlib/SRC/ssytrd_sy2sb.f +++ b/lapack-netlib/SRC/ssytrd_sy2sb.f @@ -124,7 +124,7 @@ *> \param[out] WORK *> \verbatim *> WORK is REAL array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY) +*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -294,8 +296,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. 
.NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -314,7 +320,7 @@ CALL XERBLA( 'SSYTRD_SY2SB', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN END IF * @@ -507,7 +513,7 @@ END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SSYTRD_SY2SB diff --git a/lapack-netlib/SRC/ssytrf.f b/lapack-netlib/SRC/ssytrf.f index a788fbcf0..55f3a4f0f 100644 --- a/lapack-netlib/SRC/ssytrf.f +++ b/lapack-netlib/SRC/ssytrf.f @@ -234,7 +234,7 @@ * NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -353,7 +353,8 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF diff --git a/lapack-netlib/SRC/ssytrf_aa.f b/lapack-netlib/SRC/ssytrf_aa.f index d6408a978..af32fb064 100644 --- a/lapack-netlib/SRC/ssytrf_aa.f +++ b/lapack-netlib/SRC/ssytrf_aa.f @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= MAX(1,2*N). For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -128,7 +130,7 @@ *> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -142,19 +144,19 @@ * .. * .. Array Arguments .. INTEGER IPIV( * ) - REAL A( LDA, * ), WORK( * ) + REAL A( LDA, * ), WORK( * ) * .. * * ===================================================================== * .. Parameters .. - REAL ZERO, ONE + REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB - REAL ALPHA + REAL ALPHA * .. * .. External Functions .. LOGICAL LSAME @@ -180,19 +182,26 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -204,11 +213,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN RETURN END IF * @@ -458,7 +467,8 @@ END IF * 20 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF_AA diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.f b/lapack-netlib/SRC/ssytrf_aa_2stage.f index abe6564c5..6b5cdee1b 100644 --- a/lapack-netlib/SRC/ssytrf_aa_2stage.f +++ b/lapack-netlib/SRC/ssytrf_aa_2stage.f @@ -94,7 +94,7 @@ *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL workspace of size LWORK +*> WORK is REAL workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -212,9 +212,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -228,10 +228,10 @@ NB = ILAENV( 1, 'SSYTRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = SROUNDUP_LWORK( MAX( 1, (3*NB+1)*N ) ) END IF IF( WQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(N*NB) + WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N*NB ) ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -240,7 +240,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * diff --git a/lapack-netlib/SRC/ssytrf_rk.f b/lapack-netlib/SRC/ssytrf_rk.f index 72830543c..89ecf38fd 100644 --- a/lapack-netlib/SRC/ssytrf_rk.f +++ b/lapack-netlib/SRC/ssytrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension ( MAX(1,LWORK) ). +*> WORK is REAL array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. 
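For the ?SYTRF family, the documented optimum LWORK >= N*NB is what enables the blocked factorization path; the minimum merely guarantees correctness. A short example of querying that size and then factoring and solving with SSYTRF/SSYTRS (illustrative only, not part of the patch; assumes a linked LAPACK):

    program ssytrf_solve
       implicit none
       integer, parameter :: n = 3, nrhs = 1
       real    :: a(n,n), b(n,nrhs), wq(1)
       integer :: ipiv(n), info, lwork
       real, allocatable :: work(:)
       external ssytrf, ssytrs

       ! Symmetric indefinite system; only the upper triangle is referenced.
       a = reshape( [ 4.0, 1.0, 2.0,   &
                      1.0, 5.0, 3.0,   &
                      2.0, 3.0, 6.0 ], [ n, n ] )
       b(:,1) = [ 1.0, 2.0, 3.0 ]
       ! Query the optimal LWORK (N*NB for the blocked code path).
       call ssytrf( 'U', n, a, n, ipiv, wq, -1, info )
       lwork = int( wq(1) )
       allocate( work(lwork) )
       call ssytrf( 'U', n, a, n, ipiv, work, lwork, info )
       if ( info == 0 ) call ssytrs( 'U', n, nrhs, a, n, ipiv, b, n, info )
       print *, 'info =', info, '  x =', b(:,1)
    end program ssytrf_solve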
*> @@ -312,7 +312,7 @@ * NB = ILAENV( 1, 'SSYTRF_RK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -488,7 +488,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF_RK diff --git a/lapack-netlib/SRC/ssytrf_rook.f b/lapack-netlib/SRC/ssytrf_rook.f index 339a229e7..7c2cbbc57 100644 --- a/lapack-netlib/SRC/ssytrf_rook.f +++ b/lapack-netlib/SRC/ssytrf_rook.f @@ -118,7 +118,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -260,7 +260,7 @@ * NB = ILAENV( 1, 'SSYTRF_ROOK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -383,7 +383,8 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF_ROOK diff --git a/lapack-netlib/SRC/ssytri2.f b/lapack-netlib/SRC/ssytri2.f index 49f6cad65..fd1c53473 100644 --- a/lapack-netlib/SRC/ssytri2.f +++ b/lapack-netlib/SRC/ssytri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (N+NB+1)*(NB+3) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE SSYTRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -147,7 +147,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSYTRI, SSYTRI2X, XERBLA @@ -159,9 +160,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +178,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = SROUNDUP_LWORK( MINSIZE ) RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - - IF( NBMAX .GE. 
N ) THEN +* + IF( NBMAX.GE.N ) THEN CALL SSYTRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL SSYTRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of SSYTRI2 diff --git a/lapack-netlib/SRC/ssytri_3.f b/lapack-netlib/SRC/ssytri_3.f index bca01105d..f0152a149 100644 --- a/lapack-netlib/SRC/ssytri_3.f +++ b/lapack-netlib/SRC/ssytri_3.f @@ -119,16 +119,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (N+NB+1)*(NB+3). +*> WORK is REAL array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= (N+NB+1)*(NB+3). +*> The length of WORK. +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> -*> If LDWORK = -1, then a workspace query is assumed; +*> If LWORK = -1, then a workspace query is assumed; *> the routine only calculates the optimal size of the optimal *> size of the WORK array, returns this value as the first *> entry of the WORK array, and no error message related to @@ -209,8 +210,13 @@ * * Determine the block size * - NB = MAX( 1, ILAENV( 1, 'SSYTRI_3', UPLO, N, -1, -1, -1 ) ) - LWKOPT = ( N+NB+1 ) * ( NB+3 ) + IF( N.EQ.0 ) THEN + LWKOPT = 1 + ELSE + NB = MAX( 1, ILAENV( 1, 'SSYTRI_3', UPLO, N, -1, -1, -1 ) ) + LWKOPT = ( N+NB+1 ) * ( NB+3 ) + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 @@ -218,7 +224,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN INFO = -8 END IF * @@ -226,7 +232,6 @@ CALL XERBLA( 'SSYTRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -237,7 +242,7 @@ * CALL SSYTRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO ) * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssytrs_aa.f b/lapack-netlib/SRC/ssytrs_aa.f index 12fca0c71..265cf0c1d 100644 --- a/lapack-netlib/SRC/ssytrs_aa.f +++ b/lapack-netlib/SRC/ssytrs_aa.f @@ -105,7 +105,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -141,7 +147,7 @@ * .. * .. Array Arguments .. INTEGER IPIV( * ) - REAL A( LDA, * ), B( LDB, * ), WORK( * ) + REAL A( LDA, * ), B( LDB, * ), WORK( * ) * .. * * ===================================================================== @@ -151,24 +157,31 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGTSV, SSWAP, SLACPY, STRSM, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. 
.NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -179,21 +192,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKMIN ) RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/zbbcsd.f b/lapack-netlib/SRC/zbbcsd.f index 757e98c71..6601f4a06 100644 --- a/lapack-netlib/SRC/zbbcsd.f +++ b/lapack-netlib/SRC/zbbcsd.f @@ -804,7 +804,7 @@ CALL DLARTGP( B22BULGE, B22E(I-1), RWORK(IU2SN+I-1), $ RWORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL DLARTGS( B21E(I), B21E(I+1), NU, RWORK(IU2CS+I-1), + CALL DLARTGS( B21E(I), B21D(I+1), NU, RWORK(IU2CS+I-1), $ RWORK(IU2SN+I-1) ) ELSE CALL DLARTGS( B22D(I), B22E(I), MU, RWORK(IU2CS+I-1), diff --git a/lapack-netlib/SRC/zgebrd.f b/lapack-netlib/SRC/zgebrd.f index f1791c6a4..c1a6169a7 100644 --- a/lapack-netlib/SRC/zgebrd.f +++ b/lapack-netlib/SRC/zgebrd.f @@ -122,7 +122,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -147,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -223,8 +224,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. EXTERNAL XERBLA, ZGEBD2, ZGEMM, ZLABRD @@ -241,9 +242,17 @@ * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + END IF WORK( 1 ) = DBLE( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -251,7 +260,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -263,7 +272,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -282,7 +290,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using diff --git a/lapack-netlib/SRC/zgedmd.f90 b/lapack-netlib/SRC/zgedmd.f90 index 090641ad8..5045cb166 100644 --- a/lapack-netlib/SRC/zgedmd.f90 +++ b/lapack-netlib/SRC/zgedmd.f90 @@ -1,389 +1,539 @@ - SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & - M, N, X, LDX, Y, LDY, NRNK, TOL, & - K, EIGS, Z, LDZ, RES, B, LDB, & - W, LDW, S, LDS, ZWORK, LZWORK, & - RWORK, LRWORK, IWORK, LIWORK, INFO ) -! March 2023 -!..... - USE iso_fortran_env - IMPLICIT NONE - INTEGER, PARAMETER :: WP = real64 - -!..... -! 
Scalar arguments - CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF - INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & - NRNK, LDZ, LDB, LDW, LDS, & - LIWORK, LRWORK, LZWORK - INTEGER, INTENT(OUT) :: K, INFO - REAL(KIND=WP), INTENT(IN) :: TOL -! Array arguments - COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) - COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & - W(LDW,*), S(LDS,*) - COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) - COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) - REAL(KIND=WP), INTENT(OUT) :: RES(*) - REAL(KIND=WP), INTENT(OUT) :: RWORK(*) - INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! ZGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, ZGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, ZGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. +!> \brief \b ZGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== ! +! SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, EIGS, Z, LDZ, RES, B, LDB, & +! W, LDW, S, LDS, ZWORK, LZWORK, & +! RWORK, LRWORK, IWORK, LIWORK, INFO ) +!...... +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real64 +! +!...... +! Scalar arguments +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LIWORK, LRWORK, LZWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments +! COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) +! COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) +! REAL(KIND=WP), INTENT(OUT) :: RES(*) +! REAL(KIND=WP), INTENT(OUT) :: RWORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) +! +!............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> ZGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. 
For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, ZGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, ZGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim !...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim +!...................................................................... 
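In the notation of this header, the computation can be summarized by the standard DMD derivation that references [1]-[3] describe (a sketch of the mathematics, not text from the routine). With the thin SVD of the snapshot matrix and Y = A*X:

    X = U \Sigma V^*, \qquad U_K = U(:,1:K) \quad \text{(POD basis, returned in X)}
    S = U_K^* A\, U_K = U_K^*\, Y\, V_K\, \Sigma_K^{-1} \qquad \text{(Rayleigh quotient)}
    S\, w_i = \lambda_i w_i, \qquad z_i = U_K w_i, \qquad \mathrm{RES}(i) = \lVert A z_i - \lambda_i z_i \rVert_2

so EIGS returns the Ritz values \lambda_i, W the eigenvectors w_i, Z the Ritz vectors z_i, and S(1:K,1:K) the Rayleigh quotient, matching the argument descriptions below.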
+!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim !............................................................ ! Arguments ! ========= -! JOBS (input) CHARACTER*1 -! Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim !..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 'N' :: The eigenvectors are not computed. +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim !..... -! JOBR (input) CHARACTER*1 -! Determines whether to compute the residuals. -! 'R' :: The residuals for the computed eigenpairs will be -! computed and stored in the array RES. -! See the description of RES. -! For this option to be legal, JOBZ must be 'V'. -! 'N' :: The residuals are not computed. +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. 
+!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim !..... -! JOBF (input) CHARACTER*1 -! Specifies whether to store information needed for post- -! processing (e.g. computing refined Ritz vectors) -! 'R' :: The matrix needed for the refinement of the Ritz -! vectors is computed and stored in the array B. -! See the description of B. -! 'E' :: The unscaled eigenvectors of the Exact DMD are -! computed and returned in the array B. See the -! description of B. -! 'N' :: No eigenvector refinement data is computed. +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim !..... -! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } -! Allows for a selection of the SVD algorithm from the -! LAPACK library. -! 1 :: ZGESVD (the QR SVD algorithm) -! 2 :: ZGESDD (the Divide and Conquer algorithm; if enough -! workspace available, this is the fastest option) -! 3 :: ZGESVDQ (the preconditioned QR SVD ; this and 4 -! are the most accurate options) -! 4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 -! are the most accurate options) -! For the four methods above, a significant difference in -! the accuracy of small singular values is possible if -! the snapshots vary in norm so that X is severely -! ill-conditioned. If small (smaller than EPS*||X||) -! singular values are of interest and JOBS=='N', then -! the options (3, 4) give the most accurate results, where -! the option 4 is slightly better and with stronger -! theoretical background. -! If JOBS=='S', i.e. the columns of X will be normalized, -! then all methods give nearly equally accurate results. +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. +!> 1 :: ZGESVD (the QR SVD algorithm) +!> 2 :: ZGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: ZGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim !..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim !..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). 
+!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim !..... -! X (input/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. +!> \param[in] LDX +!> \verbatim +!> X (input/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. !..... -! LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim !..... -! Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim !..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. +!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim !..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1) -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. 
On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1) +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim !..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim !..... -! K (output) INTEGER, 0 <= K <= N -! The dimension of the POD basis for the data snapshot -! matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim !..... -! EIGS (output) COMPLEX(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of EIGS contain -! the computed eigenvalues (Ritz values). -! See the descriptions of K, and Z. +!> \param[out] EIGS +!> \verbatim +!> EIGS (output) COMPLEX(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of EIGS contain +!> the computed eigenvalues (Ritz values). +!> See the descriptions of K, and Z. +!> \endverbatim !..... -! Z (workspace/output) COMPLEX(KIND=WP) M-by-N array -! If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) -! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. -! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as -! the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) -! is an eigenvector corresponding to EIGS(i). The columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. -! See the descriptions of EIGS, X and W. +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) COMPLEX(KIND=WP) M-by-N array +!> If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) +!> is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +!> If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +!> the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) +!> is an eigenvector corresponding to EIGS(i). The columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. +!> See the descriptions of EIGS, X and W. +!> \endverbatim !..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim !..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs, -! 
RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. -! See the description of EIGS and Z. +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs, +!> RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. +!> See the description of EIGS and Z. +!> \endverbatim !..... -! B (output) COMPLEX(KIND=WP) M-by-N array. -! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1:K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. +!> \param[out] B +!> \verbatim +!> B (output) COMPLEX(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1:K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim !..... -! LDB (input) INTEGER, LDB >= M -! The leading dimension of the array B. +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim !..... -! W (workspace/output) COMPLEX(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient. -! The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! W is also used as a workspace to temporarily store the -! right singular vectors of X. +!> \param[out] W +!> \verbatim +!> W (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient. +!> The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> right singular vectors of X. +!> \endverbatim !..... -! LDW (input) INTEGER, LDW >= N -! The leading dimension of the array W. +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim !..... -! S (workspace/output) COMPLEX(KIND=WP) N-by-N array -! The array S(1:K,1:K) is used for the matrix Rayleigh -! quotient. This content is overwritten during -! the eigenvalue decomposition by ZGEEV. -! See the description of K. +!> \param[out] S +!> \verbatim +!> S (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by ZGEEV. +!> See the description of K. +!> \endverbatim !..... -! LDS (input) INTEGER, LDS >= N -! The leading dimension of the array S. +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim !..... -! ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array -! ZWORK is used as complex workspace in the complex SVD, as -! specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing -! the eigenvalues of a Rayleigh quotient. -! 
If the call to ZGEDMD is only workspace query, then -! ZWORK(1) contains the minimal complex workspace length and -! ZWORK(2) is the optimal complex workspace length. -! Hence, the length of work is at least 2. -! See the description of LZWORK. +!> \param[out] ZWORK +!> \verbatim +!> ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array +!> ZWORK is used as complex workspace in the complex SVD, as +!> specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing +!> the eigenvalues of a Rayleigh quotient. +!> If the call to ZGEDMD is only workspace query, then +!> ZWORK(1) contains the minimal complex workspace length and +!> ZWORK(2) is the optimal complex workspace length. +!> Hence, the length of work is at least 2. +!> See the description of LZWORK. +!> \endverbatim !..... -! LZWORK (input) INTEGER -! The minimal length of the workspace vector ZWORK. -! LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV), -! where LZWORK_ZGEEV = MAX( 1, 2*N ) and the minimal -! LZWORK_SVD is calculated as follows -! If WHTSVD == 1 :: ZGESVD :: -! LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) -! If WHTSVD == 2 :: ZGESDD :: -! LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) -! If WHTSVD == 3 :: ZGESVDQ :: -! LZWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: ZGEJSV :: -! LZWORK_SVD = obtainable by a query -! If on entry LZWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths and returns them in -! LZWORK(1) and LZWORK(2), respectively. +!> \param[in] LZWORK +!> \verbatim +!> LZWORK (input) INTEGER +!> The minimal length of the workspace vector ZWORK. +!> LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV), +!> where LZWORK_ZGEEV = MAX( 1, 2*N ) and the minimal +!> LZWORK_SVD is calculated as follows +!> If WHTSVD == 1 :: ZGESVD :: +!> LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) +!> If WHTSVD == 2 :: ZGESDD :: +!> LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) +!> If WHTSVD == 3 :: ZGESVDQ :: +!> LZWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: ZGEJSV :: +!> LZWORK_SVD = obtainable by a query +!> If on entry LZWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths and returns them in +!> LZWORK(1) and LZWORK(2), respectively. +!> \endverbatim !..... -! RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array -! On exit, RWORK(1:N) contains the singular values of -! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). -! If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain -! scaling factor RWORK(N+2)/RWORK(N+1) used to scale X -! and Y to avoid overflow in the SVD of X. -! This may be of interest if the scaling option is off -! and as many as possible smallest eigenvalues are -! desired to the highest feasible accuracy. -! If the call to ZGEDMD is only workspace query, then -! RWORK(1) contains the minimal workspace length. -! See the description of LRWORK. +!> \param[out] RWORK +!> \verbatim +!> RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array +!> On exit, RWORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain +!> scaling factor RWORK(N+2)/RWORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. 
+!> If the call to ZGEDMD is only workspace query, then +!> RWORK(1) contains the minimal workspace length. +!> See the description of LRWORK. +!> \endverbatim !..... -! LRWORK (input) INTEGER -! The minimal length of the workspace vector RWORK. -! LRWORK is calculated as follows: -! LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where -! LRWORK_ZGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace -! for the SVD subroutine determined by the input parameter -! WHTSVD. -! If WHTSVD == 1 :: ZGESVD :: -! LRWORK_SVD = 5*MIN(M,N) -! If WHTSVD == 2 :: ZGESDD :: -! LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), -! 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) -! If WHTSVD == 3 :: ZGESVDQ :: -! LRWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: ZGEJSV :: -! LRWORK_SVD = obtainable by a query -! If on entry LRWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! real workspace length and returns it in RWORK(1). +!> \param[in] LRWORK +!> \verbatim +!> LRWORK (input) INTEGER +!> The minimal length of the workspace vector RWORK. +!> LRWORK is calculated as follows: +!> LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where +!> LRWORK_ZGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace +!> for the SVD subroutine determined by the input parameter +!> WHTSVD. +!> If WHTSVD == 1 :: ZGESVD :: +!> LRWORK_SVD = 5*MIN(M,N) +!> If WHTSVD == 2 :: ZGESDD :: +!> LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), +!> 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) +!> If WHTSVD == 3 :: ZGESVDQ :: +!> LRWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: ZGEJSV :: +!> LRWORK_SVD = obtainable by a query +!> If on entry LRWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> real workspace length and returns it in RWORK(1). +!> \endverbatim !..... -! IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim !..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for ZWORK, RWORK and -! IWORK. See the descriptions of ZWORK, RWORK and IWORK. +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for ZWORK, RWORK and +!> IWORK. 
See the descriptions of ZWORK, RWORK and IWORK. +!> \endverbatim !..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! !............................................................. !............................................................. + SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, EIGS, Z, LDZ, RES, B, LDB, & + W, LDW, S, LDS, ZWORK, LZWORK, & + RWORK, LRWORK, IWORK, LIWORK, INFO ) +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +! +! Scalar arguments +! ~~~~~~~~~~~~~~~~ + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LIWORK, LRWORK, LZWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! +! Array arguments +! ~~~~~~~~~~~~~~~ + COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) + COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) + REAL(KIND=WP), INTENT(OUT) :: RES(*) + REAL(KIND=WP), INTENT(OUT) :: RWORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +! ! Parameters ! ~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) - +! ! Local scalars ! ~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -401,7 +551,7 @@ ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: RDUMMY(2) - +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) ZLANGE, DLAMCH, DZNRM2 @@ -409,13 +559,13 @@ INTEGER IZAMAX LOGICAL DISNAN, LSAME EXTERNAL DISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! 
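!
!     [Editorial note, not part of the patch] A minimal workspace-query
!     sketch for ZGEDMD, assembled from the argument descriptions above.
!     The query arrays ZQRY, RQRY, IQRY and the mode choices ('S', 'V',
!     'R', 'N', WHTSVD=1) are illustrative assumptions only. With
!     LZWORK = LRWORK = LIWORK = -1 the routine only computes the
!     required workspace lengths:
!
!        CALL ZGEDMD( 'S', 'V', 'R', 'N', 1, M, N, X, LDX, Y, LDY, &
!                     NRNK, TOL, K, EIGS, Z, LDZ, RES, B, LDB,     &
!                     W, LDW, S, LDS, ZQRY, -1, RQRY, -1,          &
!                     IQRY, -1, INFO )
!        LZWORK = INT( ZQRY(2) )   ! optimal; ZQRY(1) holds the minimum
!        LRWORK = INT( RQRY(1) )   ! minimal real workspace
!        LIWORK = IQRY(1)          ! minimal integer workspace
!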
~~~~~~~~~~~~~~~~~~~~ EXTERNAL ZAXPY, ZGEMM, ZDSCAL EXTERNAL ZGEEV, ZGEJSV, ZGESDD, ZGESVD, ZGESVDQ, & ZLACPY, ZLASCL, ZLASSQ, XERBLA - +! ! Intrinsic functions ! ~~~~~~~~~~~~~~~~~~~ INTRINSIC DBLE, INT, MAX, SQRT @@ -608,7 +758,8 @@ K = 0 DO i = 1, N !WORK(i) = DZNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL ZLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 @@ -681,7 +832,8 @@ ! carefully computed using ZLASSQ. DO i = 1, N !RWORK(i) = DZNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL ZLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 diff --git a/lapack-netlib/SRC/zgehrd.f b/lapack-netlib/SRC/zgehrd.f index e18493cf9..0f4424ded 100644 --- a/lapack-netlib/SRC/zgehrd.f +++ b/lapack-netlib/SRC/zgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gehrd * *> \par Further Details: * ===================== @@ -173,7 +173,7 @@ INTEGER IHI, ILO, INFO, LDA, LWORK, N * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) + COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) * .. * * ===================================================================== @@ -182,7 +182,7 @@ INTEGER NBMAX, LDT, TSIZE PARAMETER ( NBMAX = 64, LDT = NBMAX+1, $ TSIZE = LDT*NBMAX ) - COMPLEX*16 ZERO, ONE + COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) * .. @@ -190,7 +190,7 @@ LOGICAL LQUERY INTEGER I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB, $ NBMIN, NH, NX - COMPLEX*16 EI + COMPLEX*16 EI * .. * .. External Subroutines .. EXTERNAL ZAXPY, ZGEHD2, ZGEMM, ZLAHR2, ZLARFB, ZTRMM, @@ -221,12 +221,18 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'ZGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'ZGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + END IF WORK( 1 ) = LWKOPT ENDIF * @@ -248,7 +254,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -268,7 +273,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of diff --git a/lapack-netlib/SRC/zgelq.f b/lapack-netlib/SRC/zgelq.f index de7c9a378..86610e801 100644 --- a/lapack-netlib/SRC/zgelq.f +++ b/lapack-netlib/SRC/zgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -166,6 +166,8 @@ *> the LQ factorization. 
*> \endverbatim *> +*> \ingroup gelq +*> * ===================================================================== SUBROUTINE ZGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) diff --git a/lapack-netlib/SRC/zgelqf.f b/lapack-netlib/SRC/zgelqf.f index 6c295eece..e988ea818 100644 --- a/lapack-netlib/SRC/zgelqf.f +++ b/lapack-netlib/SRC/zgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -118,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gelqf * *> \par Further Details: * ===================== @@ -174,9 +175,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'ZGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -184,19 +184,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = LWKOPT RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN diff --git a/lapack-netlib/SRC/zgemlq.f b/lapack-netlib/SRC/zgemlq.f index 41cd1c059..11489087a 100644 --- a/lapack-netlib/SRC/zgemlq.f +++ b/lapack-netlib/SRC/zgemlq.f @@ -109,16 +109,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -142,7 +143,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -158,11 +159,13 @@ *> block sizes MB and NB returned by ILAENV, ZGELQ will use either *> ZLASWLQ (if the matrix is wide-and-short) or ZGELQT to compute *> the LQ factorization. -*> This version of ZGEMLQ will use either ZLAMSWLQ or ZGEMLQT to +*> This version of ZGEMLQ will use either ZLAMSWLQ or ZGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in ZLAMSWLQ or ZGEMLQT. *> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE ZGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -184,7 +187,7 @@ * .. * .. Local Scalars .. 
LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -200,7 +203,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -215,6 +218,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -243,7 +253,7 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( ( LWORK.LT.LWMIN ) .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 END IF * @@ -260,7 +270,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/zgemqr.f b/lapack-netlib/SRC/zgemqr.f index c83eaff2f..d14d74fe2 100644 --- a/lapack-netlib/SRC/zgemqr.f +++ b/lapack-netlib/SRC/zgemqr.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -166,6 +167,8 @@ *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE ZGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -187,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -203,7 +206,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -218,6 +221,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -246,12 +256,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
.NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN @@ -263,7 +273,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -276,7 +286,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = LWMIN * RETURN * diff --git a/lapack-netlib/SRC/zgeqlf.f b/lapack-netlib/SRC/zgeqlf.f index 94721540c..a27612c64 100644 --- a/lapack-netlib/SRC/zgeqlf.f +++ b/lapack-netlib/SRC/zgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -113,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup geqlf * *> \par Further Details: * ===================== @@ -188,8 +189,9 @@ END IF WORK( 1 ) = LWKOPT * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * diff --git a/lapack-netlib/SRC/zgeqp3rk.c b/lapack-netlib/SRC/zgeqp3rk.c new file mode 100644 index 000000000..0c8b41c2d --- /dev/null +++ b/lapack-netlib/SRC/zgeqp3rk.c @@ -0,0 +1,1074 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + 
ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + +/* Check if the matrix A contains +Inf or -Inf, set INFO parameter */ +/* to the column number, where the first +/-Inf is found plus N, */ +/* and continue the computation. */ + + *info = *n + kp1; + + } + +/* ================================================================== */ + +/* Quick return if possible for the case when the first */ +/* stopping criterion is satisfied, i.e. KMAX = 0. */ + + if (*kmax == 0) { + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.; + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + z__1.r = (doublereal) lwkopt, z__1.i = 0.; + work[1].r = z__1.r, work[1].i = z__1.i; + return 0; + } + +/* ================================================================== */ + + eps = dlamch_("Epsilon"); + +/* Adjust ABSTOL */ + + if (*abstol >= 0.) { + safmin = dlamch_("Safe minimum"); +/* Computing MAX */ + d__1 = *abstol, d__2 = safmin * 2.; + *abstol = f2cmax(d__1,d__2); + } + +/* Adjust RELTOL */ + + if (*reltol >= 0.) { + *reltol = f2cmax(*reltol,eps); + } + +/* =================================================================== */ + +/* JMAX is the maximum index of the column to be factorized, */ +/* which is also limited by the first stopping criterion KMAX. */ + + jmax = f2cmin(*kmax,minmn); + +/* =================================================================== */ + +/* Quick return if possible for the case when the second or third */ +/* stopping criterion for the whole original matrix is satified, */ +/* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */ +/* (which is ONE <= RELTOL). */ + + if (maxc2nrm <= *abstol || 1. <= *reltol) { + + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.; + + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + + z__1.r = (doublereal) lwkopt, z__1.i = 0.; + work[1].r = z__1.r, work[1].i = z__1.i; + return 0; + } + +/* ================================================================== */ +/* Factorize columns */ +/* ================================================================== */ + +/* Determine the block size. */ + + nbmin = 2; + nx = 0; + + if (nb > 1 && nb < minmn) { + +/* Determine when to cross over from blocked to unblocked code. */ +/* (for N less than NX, unblocked code should be used). 
*/
+
+/* Computing MAX */
+	i__1 = 0, i__2 = ilaenv_(&c__3, "ZGEQP3RK", " ", m, n, &c_n1, &c_n1, (
+		ftnlen)8, (ftnlen)1);
+	nx = f2cmax(i__1,i__2);
+
+	if (nx < minmn) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+	    if (*lwork < lwkopt) {
+
+/* Not enough workspace to use optimal block size that */
+/* is currently stored in NB. */
+/* Reduce NB and determine the minimum value of NB. */
+
+		nb = (*lwork - (*n << 1)) / (*n + 1);
+/* Computing MAX */
+		i__1 = 2, i__2 = ilaenv_(&c__2, "ZGEQP3RK", " ", m, n, &c_n1,
+			&c_n1, (ftnlen)8, (ftnlen)1);
+		nbmin = f2cmax(i__1,i__2);
+
+	    }
+	}
+    }
+
+/* ================================================================== */
+
+/* DONE is the boolean flag to represent the case when the */
+/* factorization completed in the block factorization routine, */
+/* before the end of the block. */
+
+    done = FALSE_;
+
+/* J is the column index. */
+
+    j = 1;
+
+/* (1) Use blocked code initially. */
+
+/* JMAXB is the maximum column index of the block, when the */
+/* blocked code is used, and is also limited by the first stopping */
+/* criterion KMAX. */
+
+/* Computing MIN */
+    i__1 = *kmax, i__2 = minmn - nx;
+    jmaxb = f2cmin(i__1,i__2);
+
+    if (nb >= nbmin && nb < jmax && jmaxb > 0) {
+
+/* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */
+/* J is the column index of a column block; */
+/* JB is the column block size to pass to block factorization */
+/* routine in a loop step; */
+/* JBF is the number of columns that were actually factorized, */
+/* as returned by the block factorization routine */
+/* in a loop step, JBF <= JB; */
+/* N_SUB is the number of columns in the submatrix; */
+/* IOFFSET is the number of rows that should not be factorized. */
+
+	while(j <= jmaxb) {
+
+/* Computing MIN */
+	    i__1 = nb, i__2 = jmaxb - j + 1;
+	    jb = f2cmin(i__1,i__2);
+	    n_sub__ = *n - j + 1;
+	    ioffset = j - 1;
+
+/* Factorize JB columns among the columns A(J:N). */
+
+	    i__1 = *n + *nrhs - j + 1;
+	    zlaqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1,
+		    &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf,
+		    maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], &
+		    rwork[*n + j], &work[1], &work[jb + 1], &i__1, &iwork[1],
+		    &iinfo);
+
+/* Set INFO on the first occurrence of Inf. */
+
+	    if (iinfo > n_sub__ && *info == 0) {
+		*info = (ioffset << 1) + iinfo;
+	    }
+
+	    if (done) {
+
+/* Either the submatrix is zero before the end of the */
+/* column block, or the ABSTOL or RELTOL criterion is */
+/* satisfied before the end of the column block; then we can */
+/* return from the routine. Perform the following before */
+/* returning: */
+/* a) Set the number of factorized columns K, */
+/* K = IOFFSET + JBF from the last call of blocked */
+/* routine. */
+/* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */
+/* by the block factorization routine; */
+/* 2) The remaining TAUs are set to ZERO by the */
+/* block factorization routine. */
+
+		*k = ioffset + jbf;
+
+/* Set INFO on the first occurrence of NaN, NaN takes */
+/* precedence over Inf. */
+
+		if (iinfo <= n_sub__ && iinfo > 0) {
+		    *info = ioffset + iinfo;
+		}
+
+/* Return from the routine. */
+
+		z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+		work[1].r = z__1.r, work[1].i = z__1.i;
+
+		return 0;
+
+	    }
+
+	    j += jbf;
+
+	}
+
+    }
+
+/* Use unblocked code to factor the last or only block. */
+/* J = JMAX+1 means we factorized the maximum possible number of */
+/* columns, that is, in the ELSE clause we need to compute */
+/* the MAXC2NORM and RELMAXC2NORM to return after we processed */
+/* the blocks. */
+
+    if (j <= jmax) {
+
+/* N_SUB is the number of columns in the submatrix; */
+/* IOFFSET is the number of rows that should not be factorized. */
+
+	n_sub__ = *n - j + 1;
+	ioffset = j - 1;
+
+	i__1 = jmax - j + 1;
+	zlaqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, &
+		maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk,
+		relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], &rwork[*n + j], &
+		work[1], &iinfo);
+
+/* The ABSTOL or RELTOL criterion is satisfied when the number of */
+/* the factorized columns KF is smaller than the number */
+/* of columns JMAX-J+1 supplied to be factorized by the */
+/* unblocked routine; then we can return from */
+/* the routine. Perform the following before returning: */
+/* a) Set the number of factorized columns K, */
+/* b) MAXC2NRMK and RELMAXC2NRMK are returned by the */
+/* unblocked factorization routine above. */
+
+	*k = j - 1 + kf;
+
+/* Set INFO on the first exception occurrence. */
+
+/* Set INFO on the first exception occurrence of Inf or NaN, */
+/* (NaN takes precedence over Inf). */
+
+	if (iinfo > n_sub__ && *info == 0) {
+	    *info = (ioffset << 1) + iinfo;
+	} else if (iinfo <= n_sub__ && iinfo > 0) {
+	    *info = ioffset + iinfo;
+	}
+
+    } else {
+
+/* Compute the return values for blocked code. */
+
+/* Set the number of factorized columns if the unblocked routine */
+/* was not called. */
+
+	*k = jmax;
+
+/* If there exists a residual matrix after the blocked code: */
+/* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */
+/* residual matrix, otherwise set them to ZERO; */
+/* 2) Set TAU(K+1:MINMN) to ZERO. */
+
+	if (*k < minmn) {
+	    i__1 = *n - *k;
+	    jmaxc2nrm = *k + idamax_(&i__1, &rwork[*k + 1], &c__1);
+	    *maxc2nrmk = rwork[jmaxc2nrm];
+	    if (*k == 0) {
+		*relmaxc2nrmk = 1.;
+	    } else {
+		*relmaxc2nrmk = *maxc2nrmk / maxc2nrm;
+	    }
+
+	    i__1 = minmn;
+	    for (j = *k + 1; j <= i__1; ++j) {
+		i__2 = j;
+		tau[i__2].r = 0., tau[i__2].i = 0.;
+	    }
+
+	} else {
+	    *maxc2nrmk = 0.;
+	    *relmaxc2nrmk = 0.;
+
+	}
+
+/* END IF( J.LE.JMAX ) THEN */
+
+    }
+
+    z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+    work[1].r = z__1.r, work[1].i = z__1.i;
+
+    return 0;
+
+/* End of ZGEQP3RK */
+
+} /* zgeqp3rk_ */
+
diff --git a/lapack-netlib/SRC/zgeqp3rk.f b/lapack-netlib/SRC/zgeqp3rk.f
new file mode 100644
index 000000000..01dcce0de
--- /dev/null
+++ b/lapack-netlib/SRC/zgeqp3rk.f
@@ -0,0 +1,1092 @@
+*> \brief \b ZGEQP3RK computes a truncated Householder QR factorization with column pivoting of a complex m-by-n matrix A by using Level 3 BLAS and overwrites m-by-nrhs matrix B with Q**H * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZGEQP3RK + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+*      $                     K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*      $                     WORK, LWORK, RWORK, IWORK, INFO )
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       INTEGER            INFO, K, KMAX, LDA, LWORK, M, N, NRHS
+*       DOUBLE PRECISION   ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IWORK( * ), JPIV( * )
+*       DOUBLE PRECISION   RWORK( * )
+*       COMPLEX*16         A( LDA, * ), TAU( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZGEQP3RK performs two tasks simultaneously:
+*>
+*> Task 1: The routine computes a truncated (rank K) or full rank
+*>   Householder QR factorization with column pivoting of a complex
+*>   M-by-N matrix A using Level 3 BLAS. K is the number of columns
+*>   that were factorized, i.e. the factorization rank of the
+*>   factor R, K <= min(M,N).
+*>
+*>   A * P(K) = Q(K) * R(K) =
+*>
+*>     = Q(K) * ( R11(K)  R12(K) ) = Q(K) * (  R(K)_approx    )
+*>              (   0     R22(K) )          ( 0 R(K)_residual ),
+*>
+*>   where:
+*>
+*>   P(K) is an N-by-N permutation matrix;
+*>   Q(K) is an M-by-M unitary matrix;
+*>   R(K)_approx = ( R11(K), R12(K) ) is a rank K approximation of the
+*>                 full rank factor R with K-by-K upper-triangular
+*>                 R11(K) and K-by-(N-K) rectangular R12(K). The diagonal
+*>                 entries of R11(K) appear in non-increasing order
+*>                 of absolute value, and absolute values of all of
+*>                 them exceed the maximum column 2-norm of R22(K)
+*>                 up to roundoff error.
+*>   R(K)_residual = R22(K) is the residual of a rank K approximation
+*>                 of the full rank factor R. It is
+*>                 an (M-K)-by-(N-K) rectangular matrix;
+*>   0 is an (M-K)-by-K zero matrix.
+*>
+*> Task 2: At the same time, the routine overwrites a complex M-by-NRHS
+*>   matrix B with Q(K)**H * B using Level 3 BLAS.
+*>
+*> =====================================================================
+*>
+*> The matrices A and B are stored on input in the array A as
+*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS)
+*> respectively.
+*>
+*>                  N     NRHS
+*>   array_A = M [ mat_A, mat_B ]
+*>
+*> The truncation criteria (i.e. when to stop the factorization)
+*> can be any of the following:
+*>
+*>   1) The input parameter KMAX, the maximum number of columns
+*>      KMAX to factorize, i.e. the factorization rank is limited
+*>      to KMAX. If KMAX >= min(M,N), the criterion is not used.
+*>
+*>   2) The input parameter ABSTOL, the absolute tolerance for
+*>      the maximum column 2-norm of the residual matrix R22(K). This
+*>      means that the factorization stops if this norm is less than
+*>      or equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used.
+*>
+*>   3) The input parameter RELTOL, the tolerance for the maximum
+*>      column 2-norm of the residual matrix R22(K) divided
+*>      by the maximum column 2-norm of the original matrix A, which
+*>      is equal to abs(R(1,1)). This means that the factorization stops
+*>      when the ratio of the maximum column 2-norm of R22(K) to
+*>      the maximum column 2-norm of A is less than or equal to RELTOL.
+*>      If RELTOL < 0.0, the criterion is not used.
+*>
+*>   4) In case both stopping criteria ABSTOL and RELTOL are not used,
+*>      and when the residual matrix R22(K) is a zero matrix in some
+*>      factorization step K. ( This stopping criterion is implicit. )
+*>
+*>   The algorithm stops when any of these conditions is first
+*>   satisfied, otherwise the whole matrix A is factorized.
+*>
+*>   To factorize the whole matrix A, use the values
+*>   KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0.
+*>
+*> The routine returns:
+*>  a) Q(K), R(K)_approx = ( R11(K), R12(K) ),
+*>     R(K)_residual = R22(K), P(K), i.e. the resulting matrices
+*>     of the factorization; P(K) is represented by JPIV,
+*>     ( if K = min(M,N), R(K)_approx is the full factor R,
+*>     and there is no residual matrix R(K)_residual);
+*>  b) K, the number of columns that were factorized,
+*>     i.e.
factorization rank; +*> c) MAXC2NRMK, the maximum column 2-norm of the residual +*> matrix R(K)_residual = R22(K), +*> ( if K = min(M,N), MAXC2NRMK = 0.0 ); +*> d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum +*> column 2-norm of the original matrix A, which is equal +*> to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 ); +*> e) Q(K)**H * B, the matrix B with the unitary +*> transformation Q(K)**H applied on the left. +*> +*> The N-by-N permutation matrix P(K) is stored in a compact form in +*> the integer array JPIV. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The M-by-M unitary matrix Q is represented as a product +*> of elementary Householder reflectors +*> +*> Q(K) = H(1) * H(2) * . . . * H(K), +*> +*> where K is the number of columns that were factorized. +*> +*> Each H(j) has the form +*> +*> H(j) = I - tau * v * v**H, +*> +*> where 1 <= j <= K and +*> I is an M-by-M identity matrix, +*> tau is a complex scalar, +*> v is a complex vector with v(1:j-1) = 0 and v(j) = 1. +*> +*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j). +*> +*> See the Further Details section for more information. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e. the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M,N), then this stopping criterion +*> is not used, the routine factorizes columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B are not modified, and +*> the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION +*> +*> The second factorization stopping criterion, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix R22(K). +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix R22(K) +*> is less than or equal to ABSTOL. Let SAFMIN = DLAMCH('S'). +*> +*> a) If ABSTOL is NaN, then no computation is performed +*> and an error message ( INFO = -5 ) is issued +*> by XERBLA. +*> +*> b) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN +*> is used. This includes the case ABSTOL = -0.0. +*> +*> d) If 2*SAFMIN <= ABSTOL then the input value +*> of ABSTOL is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If ABSTOL chosen above is >= MAXC2NRM, then this +*> stopping criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. The routine +*> returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. 
+*> This includes the case ABSTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION +*> +*> The third factorization stopping criterion, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio +*> abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of +*> the residual matrix R22(K) to the maximum column 2-norm of +*> the original matrix A. The algorithm converges (stops the +*> factorization), when abs(R(K+1,K+1))/abs(R(1,1)) A is less +*> than or equal to RELTOL. Let EPS = DLAMCH('E'). +*> +*> a) If RELTOL is NaN, then no computation is performed +*> and an error message ( INFO = -6 ) is issued +*> by XERBLA. +*> +*> b) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used. +*> This includes the case RELTOL = -0.0. +*> +*> d) If EPS <= RELTOL then the input value of RELTOL +*> is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If RELTOL chosen above is >= 1.0, then this stopping +*> criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. +*> The routine returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case RELTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> +*> NOTE: We recommend that RELTOL satisfy +*> min( 10*max(M,N)*EPS, sqrt(EPS) ) <= RELTOL +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N+NRHS) +*> +*> On entry: +*> +*> a) The subarray A(1:M,1:N) contains the M-by-N matrix A. +*> b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS +*> matrix B. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> +*> a) The subarray A(1:M,1:N) contains parts of the factors +*> of the matrix A: +*> +*> 1) If K = 0, A(1:M,1:N) contains the original matrix A. +*> 2) If K > 0, A(1:M,1:N) contains parts of the +*> factors: +*> +*> 1. The elements below the diagonal of the subarray +*> A(1:M,1:K) together with TAU(1:K) represent the +*> unitary matrix Q(K) as a product of K Householder +*> elementary reflectors. +*> +*> 2. The elements on and above the diagonal of +*> the subarray A(1:K,1:N) contain K-by-N +*> upper-trapezoidal matrix +*> R(K)_approx = ( R11(K), R12(K) ). +*> NOTE: If K=min(M,N), i.e. full rank factorization, +*> then R_approx(K) is the full factor R which +*> is upper-trapezoidal. If, in addition, M>=N, +*> then R is upper-triangular. +*> +*> 3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K) +*> rectangular matrix R(K)_residual = R22(K). +*> +*> b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains +*> the M-by-NRHS product Q(K)**H * B. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> This is the leading dimension for both matrices, A and B. +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M,KMAX,N). 
+*> +*> K also represents the number of non-zero Householder +*> vectors. +*> +*> NOTE: If K = 0, a) the arrays A and B are not modified; +*> b) the array TAU(1:min(M,N)) is set to ZERO, +*> if the matrix A does not contain NaN, +*> otherwise the elements TAU(1:min(M,N)) +*> are undefined; +*> c) the elements of the array JPIV are set +*> as follows: for j = 1:N, JPIV(j) = j. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix R22(K), +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then MAXC2NRMK equals the maximum column 2-norm +*> of the original matrix A. +*> +*> b) If 0 < K < min(M,N), then MAXC2NRMK is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then MAXC2NRMK = 0.0. +*> +*> NOTE: MAXC2NRMK in the factorization step K would equal +*> R(K+1,K+1) in the next factorization step K+1. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix R22(K) (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then RELMAXC2NRMK = 1.0. +*> +*> b) If 0 < K < min(M,N), then +*> RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then RELMAXC2NRMK = 0.0. +*> +*> NOTE: RELMAXC2NRMK in the factorization step K would equal +*> abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization +*> step K+1. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The elements of the array JPIV(1:N) are always set +*> by the routine, for example, even when no columns +*> were factorized, i.e. when K = 0, the elements are +*> set as JPIV(j) = j for j = 1:N. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX*16 array, dimension (min(M,N)) +*> The scalar factors of the elementary reflectors. +*> +*> If 0 < K <= min(M,N), only the elements TAU(1:K) of +*> the array TAU are modified by the factorization. +*> After the factorization computed, if no NaN was found +*> during the factorization, the remaining elements +*> TAU(K+1:min(M,N)) are set to zero, otherwise the +*> elements TAU(K+1:min(M,N)) are not set and therefore +*> undefined. +*> ( If K = 0, all elements of TAU are set to zero, if +*> the matrix A does not contain NaN. ) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= N+NRHS-1, otherwise. +*> For optimal performance LWORK >= NB*( N+NRHS+1 ), +*> where NB is the optimal block size for ZGEQP3RK returned +*> by ILAENV. Minimal block size MINNB=2. 
+*>
+*>    NOTE: The decision whether to use unblocked BLAS 2
+*>    or blocked BLAS 3 code is based not only on the dimension
+*>    LWORK of the available workspace WORK, but also on the
+*>    matrix A dimension N via the crossover point NX returned
+*>    by ILAENV. (For N less than NX, unblocked code should be
+*>    used.)
+*>
+*>    If LWORK = -1, then a workspace query is assumed;
+*>    the routine only calculates the optimal size of the WORK
+*>    array, returns this value as the first entry of the WORK
+*>    array, and no error message related to LWORK is issued
+*>    by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>    RWORK is DOUBLE PRECISION array, dimension (2*N)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>    IWORK is INTEGER array, dimension (N-1).
+*>    Is a work array. ( IWORK is used to store indices
+*>    of "bad" columns for norm downdating in the residual
+*>    matrix in the blocked step auxiliary subroutine ZLAQP3RK ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>    INFO is INTEGER
+*>    1) INFO = 0: successful exit.
+*>    2) INFO < 0: if INFO = -i, the i-th argument had an
+*>       illegal value.
+*>    3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>       detected and the routine stops the computation.
+*>       The j_1-th column of the matrix A or the j_1-th
+*>       element of array TAU contains the first occurrence
+*>       of NaN in the factorization step K+1 ( when K columns
+*>       have been factorized ).
+*>
+*>       On exit:
+*>       K                  is set to the number of
+*>                          factorized columns without
+*>                          exception.
+*>       MAXC2NRMK          is set to NaN.
+*>       RELMAXC2NRMK       is set to NaN.
+*>       TAU(K+1:min(M,N))  is not set and contains undefined
+*>                          elements. If j_1=K+1, TAU(K+1)
+*>                          may contain NaN.
+*>    4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>       was detected, but +Inf (or -Inf) was detected and
+*>       the routine continues the computation until completion.
+*>       The (j_2-N)-th column of the matrix A contains the first
+*>       occurrence of +Inf (or -Inf) in the factorization
+*>       step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup geqp3rk
+*
+*> \par Further Details:
+*  =====================
+*
+*> \verbatim
+*> ZGEQP3RK is based on the same BLAS3 Householder QR factorization
+*> algorithm with column pivoting as the ZGEQP3 routine, which uses
+*> the ZLARFG routine to generate Householder reflectors
+*> for QR factorization.
+*>
+*> We can also write:
+*>
+*>   A = A_approx(K) + A_residual(K)
+*>
+*> The low rank approximation matrix A(K)_approx from
+*> the truncated QR factorization of rank K of the matrix A is:
+*>
+*>   A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T
+*>                        (    0    0   )
+*>
+*>               = Q(K) * ( R11(K) R12(K) ) * P(K)**T
+*>                        (   0      0    )
+*>
+*> The residual A_residual(K) of the matrix A is:
+*>
+*>   A_residual(K) = Q(K) * ( 0        0       ) * P(K)**T =
+*>                          ( 0  R(K)_residual )
+*>
+*>                 = Q(K) * ( 0    0    ) * P(K)**T
+*>                          ( 0  R22(K) )
+*>
+*> The truncated (rank K) factorization guarantees that
+*> the maximum column 2-norm of A_residual(K) is less than
+*> or equal to MAXC2NRMK up to roundoff error.
+*>
+*> NOTE: An approximation of the null vectors
+*>       of A can be easily computed from R11(K)
+*>       and R12(K):
+*>
+*>       Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) )
+*>                                 (         -I           )
+*>
+*> \endverbatim
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*>     G.
Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, + $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ WORK, LWORK, RWORK, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + DOUBLE PRECISION RWORK( * ) + COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER INB, INBMIN, IXOVER + PARAMETER ( INB = 1, INBMIN = 2, IXOVER = 3 ) + DOUBLE PRECISION ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 ) + COMPLEX*16 CZERO + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, DONE + INTEGER IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX, + $ JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB, + $ NBMIN, NX + DOUBLE PRECISION EPS, HUGEVAL, MAXC2NRM, SAFMIN +* .. +* .. External Subroutines .. + EXTERNAL ZLAQP2RK, ZLAQP3RK, XERBLA +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX, ILAENV + DOUBLE PRECISION DLAMCH, DZNRM2 + EXTERNAL DISNAN, DLAMCH, DZNRM2, IDAMAX, ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test input arguments +* ==================== +* + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( KMAX.LT.0 ) THEN + INFO = -4 + ELSE IF( DISNAN( ABSTOL ) ) THEN + INFO = -5 + ELSE IF( DISNAN( RELTOL ) ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -8 + END IF +* +* If the input parameters M, N, NRHS, KMAX, LDA are valid: +* a) Test the input workspace size LWORK for the minimum +* size requirement IWS. +* b) Determine the optimal block size NB and optimal +* workspace size LWKOPT to be returned in WORK(1) +* in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., +* (3) when routine exits. 
+*     Here, IWS is the minimum workspace required for unblocked
+*     code.
+*
+      IF( INFO.EQ.0 ) THEN
+         MINMN = MIN( M, N )
+         IF( MINMN.EQ.0 ) THEN
+            IWS = 1
+            LWKOPT = 1
+         ELSE
+*
+*           Minimal workspace size in case of using only unblocked
+*           BLAS 2 code in ZLAQP2RK.
+*           1) ZLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in ZLARF subroutine inside ZLAQP2RK to apply an
+*              elementary reflector from the left.
+*           TOTAL_WORK_SIZE = 3*N + NRHS - 1
+*
+            IWS = N + NRHS - 1
+*
+*           Assign to NB the optimal block size.
+*
+            NB = ILAENV( INB, 'ZGEQP3RK', ' ', M, N, -1, -1 )
+*
+*           A formula for the optimal workspace size in case of using
+*           both unblocked BLAS 2 in ZLAQP2RK and blocked BLAS 3 code
+*           in ZLAQP3RK.
+*           1) ZGEQP3RK, ZLAQP2RK, ZLAQP3RK: 2*N to store full and
+*              partial column 2-norms.
+*           2) ZLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in ZLARF subroutine to apply an elementary reflector
+*              from the left.
+*           3) ZLAQP3RK: NB*(N+NRHS) to use in the work array F that
+*              is used to apply a block reflector from
+*              the left.
+*           4) ZLAQP3RK: NB to use in the auxiliary array AUX.
+*           Sizes (2) and ((3) + (4)) should intersect, therefore
+*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2.
+*           ( For example, N = 100, NRHS = 1, NB = 32 gives
+*             LWKOPT = 2*100 + 32*(100+1+1) = 3464. )
+*
+            LWKOPT = 2*N + NB*( N+NRHS+1 )
+         END IF
+         WORK( 1 ) = DCMPLX( LWKOPT )
+*
+         IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+*     NOTE: The optimal workspace size is returned in WORK(1), if
+*     the input parameters M, N, NRHS, KMAX, LDA are valid.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZGEQP3RK', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible for M=0 or N=0.
+*
+      IF( MINMN.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+*     Initialize column pivot array JPIV.
+*
+      DO J = 1, N
+         JPIV( J ) = J
+      END DO
+*
+*     ==================================================================
+*
+*     Initialize storage for partial and exact column 2-norms.
+*     a) The elements RWORK(1:N) are used to store partial column
+*        2-norms of the matrix A, and may decrease in each computation
+*        step; initialize them to the complete column 2-norms.
+*     b) The elements RWORK(N+1:2*N) are used to store complete column
+*        2-norms of the matrix A; they are not changed during the
+*        computation. Initialize them to the complete column 2-norms.
+*
+      DO J = 1, N
+         RWORK( J ) = DZNRM2( M, A( 1, J ), 1 )
+         RWORK( N+J ) = RWORK( J )
+      END DO
+*
+*     ==================================================================
+*
+*     Compute the pivot column index and the maximum column 2-norm
+*     for the whole original matrix stored in A(1:M,1:N).
+*
+      KP1 = IDAMAX( N, RWORK( 1 ), 1 )
+      MAXC2NRM = RWORK( KP1 )
+*
+*     ==================================================================
+*
+      IF( DISNAN( MAXC2NRM ) ) THEN
+*
+*        Check if the matrix A contains NaN, set INFO parameter
+*        to the column number where the first NaN is found and return
+*        from the routine.
+*
+         K = 0
+         INFO = KP1
+*
+*        Set MAXC2NRMK and RELMAXC2NRMK to NaN.
+*
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = MAXC2NRM
+*
+*        Array TAU is not set and contains undefined elements.
+*
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ===================================================================
+*
+      IF( MAXC2NRM.EQ.ZERO ) THEN
+*
+*        Check if the matrix A is a zero matrix; set array TAU and
+*        return from the routine.
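+*        ( In this case the computed rank is K = 0, both norm
+*          outputs MAXC2NRMK and RELMAXC2NRMK are set to ZERO, and
+*          all MINMN elements of TAU are set to CZERO below. )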
+*
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+*
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+*
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+*
+      END IF
+*
+*     ===================================================================
+*
+      HUGEVAL = DLAMCH( 'Overflow' )
+*
+      IF( MAXC2NRM.GT.HUGEVAL ) THEN
+*
+*        Check if the matrix A contains +Inf or -Inf, set INFO parameter
+*        to the column number, where the first +/-Inf is found plus N,
+*        and continue the computation.
+*
+         INFO = N + KP1
+*
+      END IF
+*
+*     ==================================================================
+*
+*     Quick return if possible for the case when the first
+*     stopping criterion is satisfied, i.e. KMAX = 0.
+*
+      IF( KMAX.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+      EPS = DLAMCH('Epsilon')
+*
+*     Adjust ABSTOL
+*
+      IF( ABSTOL.GE.ZERO ) THEN
+         SAFMIN = DLAMCH('Safe minimum')
+         ABSTOL = MAX( ABSTOL, TWO*SAFMIN )
+      END IF
+*
+*     Adjust RELTOL
+*
+      IF( RELTOL.GE.ZERO ) THEN
+         RELTOL = MAX( RELTOL, EPS )
+      END IF
+*
+*     ===================================================================
+*
+*     JMAX is the maximum index of the column to be factorized,
+*     which is also limited by the first stopping criterion KMAX.
+*
+      JMAX = MIN( KMAX, MINMN )
+*
+*     ===================================================================
+*
+*     Quick return if possible for the case when the second or third
+*     stopping criterion for the whole original matrix is satisfied,
+*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL
+*     (which is ONE <= RELTOL).
+*
+      IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN
+*
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+*
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+*
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*     Factorize columns
+*     ==================================================================
+*
+*     Determine the block size.
+*
+      NBMIN = 2
+      NX = 0
+*
+      IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN
+*
+*        Determine when to cross over from blocked to unblocked code.
+*        (For N less than NX, unblocked code should be used.)
+*
+         NX = MAX( 0, ILAENV( IXOVER, 'ZGEQP3RK', ' ', M, N, -1, -1 ) )
+*
+         IF( NX.LT.MINMN ) THEN
+*
+*           Determine if workspace is large enough for blocked code.
+*
+            IF( LWORK.LT.LWKOPT ) THEN
+*
+*              Not enough workspace to use the optimal block size that
+*              is currently stored in NB.
+*              Reduce NB and determine the minimum value of NB.
+*
+               NB = ( LWORK-2*N ) / ( N+1 )
+               NBMIN = MAX( 2, ILAENV( INBMIN, 'ZGEQP3RK', ' ', M, N,
+     $                 -1, -1 ) )
+*
+            END IF
+         END IF
+      END IF
+*
+*     ==================================================================
+*
+*     DONE is the boolean flag to represent the case when the
+*     factorization completed in the block factorization routine,
+*     before the end of the block.
+*
+      DONE = .FALSE.
+*
+*     J is the column index.
+*
+      J = 1
+*
+*     (1) Use blocked code initially.
+*
+*     JMAXB is the maximum column index of the block when the
+*     blocked code is used; it is also limited by the first stopping
+*     criterion KMAX.
+*
+      JMAXB = MIN( KMAX, MINMN - NX )
+*
+      IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN
+*
+*        Loop over the column blocks of the matrix A(1:M,1:JMAXB).
+*        Here:
+*        J is the column index of a column block;
+*        JB is the column block size to pass to the block factorization
+*        routine in a loop step;
+*        JBF is the number of columns that were actually factorized,
+*        as returned by the block factorization routine
+*        in a loop step, JBF <= JB;
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*        ( For example, with NB = 4 and JMAXB = 10 the loop attempts
+*          blocks of 4, 4 and 2 columns, advancing J by the returned
+*          JBF after each call, assuming each call factorizes its
+*          full block. )
+*
+         DO WHILE( J.LE.JMAXB )
+*
+            JB = MIN( NB, JMAXB-J+1 )
+            N_SUB = N-J+1
+            IOFFSET = J-1
+*
+*           Factorize JB columns among the columns A(J:N).
+*
+            CALL ZLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                     DONE, JBF, MAXC2NRMK, RELMAXC2NRMK,
+     $                     JPIV( J ), TAU( J ),
+     $                     RWORK( J ), RWORK( N+J ),
+     $                     WORK( 1 ), WORK( JB+1 ),
+     $                     N+NRHS-J+1, IWORK, IINFO )
+*
+*           Set INFO on the first occurrence of Inf.
+*
+            IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+               INFO = 2*IOFFSET + IINFO
+            END IF
+*
+            IF( DONE ) THEN
+*
+*              Either the submatrix is zero before the end of the
+*              column block, or the ABSTOL or RELTOL criterion is
+*              satisfied before the end of the column block; we can
+*              return from the routine. Perform the following before
+*              returning:
+*                 a) Set the number of factorized columns K,
+*                    K = IOFFSET + JBF from the last call of the
+*                    blocked routine.
+*                 NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned
+*                          by the block factorization routine;
+*                       2) The remaining TAUs are set to ZERO by the
+*                          block factorization routine.
+*
+               K = IOFFSET + JBF
+*
+*              Set INFO on the first occurrence of NaN; NaN takes
+*              precedence over Inf.
+*
+               IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+                  INFO = IOFFSET + IINFO
+               END IF
+*
+*              Return from the routine.
+*
+               WORK( 1 ) = DCMPLX( LWKOPT )
+*
+               RETURN
+*
+            END IF
+*
+            J = J + JBF
+*
+         END DO
+*
+      END IF
+*
+*     Use unblocked code to factor the last or only block.
+*     J = JMAX+1 means we factorized the maximum possible number of
+*     columns; that is, in the ELSE clause we need to compute
+*     the MAXC2NORM and RELMAXC2NORM to return after we processed
+*     the blocks.
+*
+      IF( J.LE.JMAX ) THEN
+*
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         N_SUB = N-J+1
+         IOFFSET = J-1
+*
+         CALL ZLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1,
+     $                  ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                  KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ),
+     $                  TAU( J ), RWORK( J ), RWORK( N+J ),
+     $                  WORK( 1 ), IINFO )
+*
+*        The ABSTOL or RELTOL criterion is satisfied when the number
+*        of factorized columns KF is smaller than the number
+*        of columns JMAX-J+1 supplied to be factorized by the
+*        unblocked routine; we can return from
+*        the routine. Perform the following before returning:
+*           a) Set the number of factorized columns K,
+*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the
+*              unblocked factorization routine above.
+*
+         K = J - 1 + KF
+*
+*        Set INFO on the first occurrence of an exception, Inf or NaN
+*        ( NaN takes precedence over Inf ).
+*
+         IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+            INFO = 2*IOFFSET + IINFO
+         ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+            INFO = IOFFSET + IINFO
+         END IF
+*
+      ELSE
+*
+*        Compute the return values for blocked code.
+*
+*        Set the number of factorized columns if the unblocked routine
+*        was not called.
+*
+         K = JMAX
+*
+*        If there exists a residual matrix after the blocked code:
+*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the
+*              residual matrix, otherwise set them to ZERO;
+*           2) Set TAU(K+1:MINMN) to ZERO.
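+*        ( NOTE: MAXC2NRMK is taken below from the partial column
+*          2-norms kept in RWORK(1:N), which the factorization
+*          routines keep updated for the columns that remain
+*          unfactorized. )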
+* + IF( K.LT.MINMN ) THEN + JMAXC2NRM = K + IDAMAX( N-K, RWORK( K+1 ), 1 ) + MAXC2NRMK = RWORK( JMAXC2NRM ) + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + DO J = K + 1, MINMN + TAU( J ) = CZERO + END DO +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + END IF +* +* END IF( J.LE.JMAX ) THEN +* + END IF +* + WORK( 1 ) = DCMPLX( LWKOPT ) +* + RETURN +* +* End of ZGEQP3RK +* + END diff --git a/lapack-netlib/SRC/zgeqr.f b/lapack-netlib/SRC/zgeqr.f index 20a80d083..7df9c2403 100644 --- a/lapack-netlib/SRC/zgeqr.f +++ b/lapack-netlib/SRC/zgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE ZGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -244,8 +246,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +257,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +272,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +286,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +313,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ * RETURN * diff --git a/lapack-netlib/SRC/zgeqrfp.f b/lapack-netlib/SRC/zgeqrfp.f index 73bcde667..3562de36e 100644 --- a/lapack-netlib/SRC/zgeqrfp.f +++ b/lapack-netlib/SRC/zgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -122,7 +123,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup geqrfp * *> \par Further Details: * ===================== @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. 
      EXTERNAL           XERBLA, ZGEQR2P, ZLARFB, ZLARFT
@@ -181,8 +182,16 @@
*
      INFO = 0
      NB = ILAENV( 1, 'ZGEQRF', ' ', M, N, -1, -1 )
-      LWKOPT = N*NB
+      K = MIN( M, N )
+      IF( K.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = N
+         LWKOPT = N*NB
+      END IF
      WORK( 1 ) = LWKOPT
+*
      LQUERY = ( LWORK.EQ.-1 )
      IF( M.LT.0 ) THEN
         INFO = -1
@@ -190,7 +199,7 @@
         INFO = -2
      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
         INFO = -7
      END IF
      IF( INFO.NE.0 ) THEN
@@ -202,7 +211,6 @@
*
*     Quick return if possible
*
-      K = MIN( M, N )
      IF( K.EQ.0 ) THEN
         WORK( 1 ) = 1
         RETURN
@@ -210,7 +218,7 @@
*
      NBMIN = 2
      NX = 0
-      IWS = N
+      IWS = LWKMIN
      IF( NB.GT.1 .AND. NB.LT.K ) THEN
*
*        Determine when to cross over from blocked to unblocked code.
diff --git a/lapack-netlib/SRC/zgesvj.f b/lapack-netlib/SRC/zgesvj.f
index 6cd2335f2..2be45d826 100644
--- a/lapack-netlib/SRC/zgesvj.f
+++ b/lapack-netlib/SRC/zgesvj.f
@@ -200,23 +200,25 @@
*> \verbatim
*>          LDV is INTEGER
*>          The leading dimension of the array V, LDV >= 1.
-*>          If JOBV = 'V', then LDV >= max(1,N).
-*>          If JOBV = 'A', then LDV >= max(1,MV) .
+*>          If JOBV = 'V', then LDV >= MAX(1,N).
+*>          If JOBV = 'A', then LDV >= MAX(1,MV).
*> \endverbatim
*>
*> \param[in,out] CWORK
*> \verbatim
-*>          CWORK is COMPLEX*16 array, dimension (max(1,LWORK))
+*>          CWORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
*>          Used as workspace.
-*>          If on entry LWORK = -1, then a workspace query is assumed and
-*>          no computation is done; CWORK(1) is set to the minial (and optimal)
-*>          length of CWORK.
*> \endverbatim
*>
*> \param[in] LWORK
*> \verbatim
*>          LWORK is INTEGER.
-*>          Length of CWORK, LWORK >= M+N.
+*>          Length of CWORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M+N, otherwise.
+*>
+*>          If on entry LWORK = -1, then a workspace query is assumed and
+*>          no computation is done; CWORK(1) is set to the minimal (and optimal)
+*>          length of CWORK.
*> \endverbatim
*>
*> \param[in,out] RWORK
@@ -247,15 +249,17 @@
*>          RWORK(6) = the largest absolute value over all sines of the
*>                     Jacobi rotation angles in the last sweep. It can be
*>                     useful for a post festum analysis.
-*>          If on entry LRWORK = -1, then a workspace query is assumed and
-*>          no computation is done; RWORK(1) is set to the minial (and optimal)
-*>          length of RWORK.
*> \endverbatim
*>
*> \param[in] LRWORK
*> \verbatim
*>          LRWORK is INTEGER
-*>          Length of RWORK, LRWORK >= MAX(6,N).
+*>          Length of RWORK.
+*>          LRWORK >= 1, if MIN(M,N) = 0, and LRWORK >= MAX(6,N), otherwise.
+*>
+*>          If on entry LRWORK = -1, then a workspace query is assumed and
+*>          no computation is done; RWORK(1) is set to the minimal (and optimal)
+*>          length of RWORK.
*> \endverbatim
*>
*> \param[out] INFO
@@ -276,7 +280,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
-*> \ingroup complex16GEcomputational
+*> \ingroup gesvj
*
*> \par Further Details:
*  =====================
@@ -367,23 +371,25 @@
*
*     .. Local Parameters ..
      DOUBLE PRECISION ZERO, HALF, ONE
-      PARAMETER  ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0)
-      COMPLEX*16   CZERO, CONE
-      PARAMETER  ( CZERO = (0.0D0, 0.0D0), CONE = (1.0D0, 0.0D0) )
-      INTEGER      NSWEEP
-      PARAMETER  ( NSWEEP = 30 )
+      PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0)
+      COMPLEX*16 CZERO, CONE
+      PARAMETER ( CZERO = (0.0D0, 0.0D0), CONE = (1.0D0, 0.0D0) )
+      INTEGER NSWEEP
+      PARAMETER ( NSWEEP = 30 )
*     ..
*     .. Local Scalars ..
- COMPLEX*16 AAPQ, OMPQ - DOUBLE PRECISION AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, - $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, - $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, - $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL - INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, - $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, - $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + COMPLEX*16 AAPQ, OMPQ + DOUBLE PRECISION AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, + $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, + $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, + $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, + $ TOL + INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, + $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, + $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, + $ SWBAND, MINMN, LWMIN, LRWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, + $ ROTOK, RSVEC, UCTOL, UPPER * .. * .. * .. Intrinsic Functions .. @@ -422,7 +428,16 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * - LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 ) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + LRWMIN = 1 + ELSE + LWMIN = M+N + LRWMIN = MAX( 6, N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -442,9 +457,9 @@ INFO = -11 ELSE IF( UCTOL .AND. ( RWORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( ( LWORK.LT.( M+N ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 - ELSE IF( ( LRWORK.LT.MAX( N, 6 ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LRWORK.LT.LRWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -15 ELSE INFO = 0 @@ -454,15 +469,15 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGESVJ', -INFO ) RETURN - ELSE IF ( LQUERY ) THEN - CWORK(1) = M + N - RWORK(1) = MAX( N, 6 ) + ELSE IF( LQUERY ) THEN + CWORK( 1 ) = LWMIN + RWORK( 1 ) = LRWMIN RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/zgetri.f b/lapack-netlib/SRC/zgetri.f index 41782841c..f3806a77c 100644 --- a/lapack-netlib/SRC/zgetri.f +++ b/lapack-netlib/SRC/zgetri.f @@ -107,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup getri * * ===================================================================== SUBROUTINE ZGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -152,7 +152,7 @@ * INFO = 0 NB = ILAENV( 1, 'ZGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zgetsls.f b/lapack-netlib/SRC/zgetsls.f index 17c6d5146..26311c611 100644 --- a/lapack-netlib/SRC/zgetsls.f +++ b/lapack-netlib/SRC/zgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \ingroup complex16GEsolve +*> \ingroup getsls * * ===================================================================== SUBROUTINE ZGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, @@ -192,7 +192,7 @@ * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH, ZLANGE - EXTERNAL LSAME, DLABAD, DLAMCH, ZLANGE + EXTERNAL LSAME, DLAMCH, ZLANGE * .. * .. External Subroutines .. EXTERNAL ZGEQR, ZGEMQR, ZLASCL, ZLASET, @@ -229,7 +229,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEO = 1 + WSIZEM = 1 + ELSE IF( M.GE.N ) THEN CALL ZGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) @@ -297,7 +300,6 @@ * SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * diff --git a/lapack-netlib/SRC/zgetsqrhrt.f b/lapack-netlib/SRC/zgetsqrhrt.f index 5f0167937..e7ce993aa 100644 --- a/lapack-netlib/SRC/zgetsqrhrt.f +++ b/lapack-netlib/SRC/zgetsqrhrt.f @@ -131,13 +131,15 @@ *> \param[in] LWORK *> \verbatim *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -160,7 +162,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup comlpex16OTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -212,7 +214,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -225,7 +227,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -263,8 +265,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN INFO = -11 END IF * diff --git a/lapack-netlib/SRC/zgges3.f b/lapack-netlib/SRC/zgges3.f index 8b3e44f88..8235c2543 100644 --- a/lapack-netlib/SRC/zgges3.f +++ b/lapack-netlib/SRC/zgges3.f @@ -215,7 +215,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N) +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -260,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \ingroup complex16GEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE ZGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -300,7 +301,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT + $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT, + $ LWKMIN DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SMLNUM * .. @@ -309,9 +311,8 @@ DOUBLE PRECISION DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL DLABAD, XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, - $ ZLAQZ0, ZLACPY, ZLASCL, ZLASET, ZTGSEN, ZUNGQR, - $ ZUNMQR + EXTERNAL XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, ZLAQZ0, + $ ZLACPY, ZLASCL, ZLASET, ZTGSEN, ZUNGQR, ZUNMQR * .. * .. External Functions .. LOGICAL LSAME @@ -353,6 +354,8 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -369,7 +372,7 @@ INFO = -14 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -16 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * @@ -377,28 +380,32 @@ * IF( INFO.EQ.0 ) THEN CALL ZGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N + INT( WORK( 1 ) ) ) CALL ZUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) IF( ILVSL ) THEN CALL ZUNGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, IERR ) LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) END IF CALL ZGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL, $ LDVSL, VSR, LDVSR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) CALL ZLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, WORK, -1, $ RWORK, 0, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) IF( WANTST ) THEN CALL ZTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, SDIM, $ PVSL, PVSR, DIF, WORK, -1, IDUM, 1, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = DCMPLX( LWKOPT ) END IF - WORK( 1 ) = DCMPLX( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -420,7 +427,6 @@ EPS = DLAMCH( 'P' ) SMLNUM = DLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/zggev3.f b/lapack-netlib/SRC/zggev3.f index 2d6c74582..0cc073470 100644 --- a/lapack-netlib/SRC/zggev3.f +++ b/lapack-netlib/SRC/zggev3.f @@ -174,7 +174,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -208,7 +209,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \ingroup complex16GEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE ZGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA, @@ -243,7 +244,7 @@ CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, $ IN, IRIGHT, IROWS, IRWRK, ITAU, IWRK, JC, JR, - $ LWKOPT + $ LWKMIN, LWKOPT DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP COMPLEX*16 X @@ -252,9 +253,8 @@ LOGICAL LDUMMA( 1 ) * .. * .. External Subroutines .. - EXTERNAL DLABAD, XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, - $ ZLAQZ0, ZLACPY, ZLASCL, ZLASET, ZTGEVC, ZUNGQR, - $ ZUNMQR + EXTERNAL XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, ZLAQZ0, + $ ZLACPY, ZLASCL, ZLASET, ZTGEVC, ZUNGQR, ZUNMQR * .. * .. External Functions .. LOGICAL LSAME @@ -301,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -315,7 +316,7 @@ INFO = -11 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -15 END IF * @@ -323,7 +324,7 @@ * IF( INFO.EQ.0 ) THEN CALL ZGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N+INT( WORK( 1 ) ) ) CALL ZUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) @@ -348,7 +349,11 @@ $ RWORK, 0, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) END IF - WORK( 1 ) = DCMPLX( LWKOPT ) + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = DCMPLX( LWKOPT ) + END IF END IF * IF( INFO.NE.0 ) THEN @@ -368,7 +373,6 @@ EPS = DLAMCH( 'E' )*DLAMCH( 'B' ) SMLNUM = DLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/zgghd3.f b/lapack-netlib/SRC/zgghd3.f index b29cdc70a..08343688d 100644 --- a/lapack-netlib/SRC/zgghd3.f +++ b/lapack-netlib/SRC/zgghd3.f @@ -176,14 +176,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -208,7 +208,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -275,7 +275,12 @@ * INFO = 0 NB = ILAENV( 1, 'ZGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF WORK( 1 ) = DCMPLX( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) @@ -325,7 +330,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = CONE RETURN @@ -883,6 +887,7 @@ IF ( JCOL.LT.IHI ) $ CALL ZGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) +* WORK( 1 ) = DCMPLX( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/zggqrf.f b/lapack-netlib/SRC/zggqrf.f index 0388b0874..d8636d663 100644 --- a/lapack-netlib/SRC/zggqrf.f +++ b/lapack-netlib/SRC/zggqrf.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \ingroup complex16OTHERcomputational +*> \ingroup ggqrf * *> \par Further Details: * ===================== @@ -250,7 +250,7 @@ NB2 = ILAENV( 1, 'ZGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'ZUNMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zggrqf.f b/lapack-netlib/SRC/zggrqf.f index be912c772..69c14af24 100644 --- a/lapack-netlib/SRC/zggrqf.f +++ b/lapack-netlib/SRC/zggrqf.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup ggrqf * *> \par Further Details: * ===================== @@ -249,7 +249,7 @@ NB2 = ILAENV( 1, 'ZGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'ZUNMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zggsvd3.f b/lapack-netlib/SRC/zggsvd3.f index 71257a7c0..40624f5be 100644 --- a/lapack-netlib/SRC/zggsvd3.f +++ b/lapack-netlib/SRC/zggsvd3.f @@ -277,7 +277,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -332,7 +332,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/zggsvp3.f b/lapack-netlib/SRC/zggsvp3.f index f39ccdad3..7b465aaee 100644 --- a/lapack-netlib/SRC/zggsvp3.f +++ b/lapack-netlib/SRC/zggsvp3.f @@ -233,7 +233,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -256,7 +256,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/zheevd.f b/lapack-netlib/SRC/zheevd.f index ba52f9e72..8e86b9e88 100644 --- a/lapack-netlib/SRC/zheevd.f +++ b/lapack-netlib/SRC/zheevd.f @@ -116,8 +116,7 @@ *> *> \param[out] RWORK *> \verbatim -*> RWORK is DOUBLE PRECISION array, -*> dimension (LRWORK) +*> RWORK is DOUBLE PRECISION array, dimension (MAX(1,LRWORK)) *> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. *> \endverbatim *> @@ -180,7 +179,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEeigen +*> \ingroup heevd * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/zheevr.f b/lapack-netlib/SRC/zheevr.f index 1452e04a3..fe6e1a85f 100644 --- a/lapack-netlib/SRC/zheevr.f +++ b/lapack-netlib/SRC/zheevr.f @@ -272,7 +272,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,2*N). +*> The length of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 2*N. *> For optimal efficiency, LWORK >= (NB+1)*N, *> where NB is the max of the blocksize for ZHETRD and for *> ZUNMTR as returned by ILAENV. @@ -294,7 +295,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). 
+*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -313,7 +315,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -338,7 +341,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEeigen +*> \ingroup heevr * *> \par Contributors: * ================== @@ -417,9 +420,15 @@ LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR. $ ( LIWORK.EQ.-1 ) ) * - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) - LWMIN = MAX( 1, 2*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 2*N + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -454,7 +463,7 @@ NB = ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'ZUNMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKOPT RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * @@ -483,7 +492,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = DBLE( A( 1, 1 ) ) @@ -710,7 +719,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKOPT RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * diff --git a/lapack-netlib/SRC/zheevr_2stage.f b/lapack-netlib/SRC/zheevr_2stage.f index 5c576e633..b1cc7175f 100644 --- a/lapack-netlib/SRC/zheevr_2stage.f +++ b/lapack-netlib/SRC/zheevr_2stage.f @@ -265,7 +265,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of ZSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the unitary transformations applied by ZUNMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -279,12 +279,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -310,7 +311,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -329,7 +331,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. 
*> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -354,7 +357,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -382,7 +385,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -390,11 +393,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -472,9 +475,16 @@ IB = ILAENV2STAGE( 2, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = N + LHTRD + LWTRD - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = N + LHTRD + LWTRD + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -535,7 +545,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = DBLE( A( 1, 1 ) ) @@ -643,9 +653,9 @@ * * Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. * - CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), + CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), $ RWORK( INDRE ), WORK( INDTAU ), - $ WORK( INDHOUS ), LHTRD, + $ WORK( INDHOUS ), LHTRD, $ WORK( INDWK ), LLWORK, IINFO ) * * If all eigenvalues are desired diff --git a/lapack-netlib/SRC/zhesv_aa.f b/lapack-netlib/SRC/zhesv_aa.f index df8498c7a..b3d4b3725 100644 --- a/lapack-netlib/SRC/zhesv_aa.f +++ b/lapack-netlib/SRC/zhesv_aa.f @@ -128,7 +128,7 @@ *> LWORK is INTEGER *> The length of WORK. LWORK >= MAX(1,2*N,3*N-2), and for best *> performance LWORK >= max(1,N*NB), where NB is the optimal -*> blocksize for ZHETRF. +*> blocksize for ZHETRF_AA. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEsolve +*> \ingroup hesv_aa * * ===================================================================== SUBROUTINE ZHESV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -177,7 +177,7 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS + INTEGER LWKMIN, LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS * .. * .. External Functions .. LOGICAL LSAME @@ -196,6 +196,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. 
.NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -206,17 +207,17 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_HETRF = INT( WORK(1) ) + LWKOPT_HETRF = INT( WORK( 1 ) ) CALL ZHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_HETRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS ) + LWKOPT_HETRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_HETRF, LWKOPT_HETRS ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/zhesv_aa_2stage.f b/lapack-netlib/SRC/zhesv_aa_2stage.f index 79c01c546..c503b5554 100644 --- a/lapack-netlib/SRC/zhesv_aa_2stage.f +++ b/lapack-netlib/SRC/zhesv_aa_2stage.f @@ -100,14 +100,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX*16 array, dimension (LTB) +*> TB is COMPLEX*16 array, dimension (MAX(1,LTB)). *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -147,14 +147,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 workspace of size LWORK +*> WORK is COMPLEX*16 workspace of size (MAX(1,LWORK)). +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -178,7 +179,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEsolve +*> \ingroup hesv_aa_2stage * * ===================================================================== SUBROUTINE ZHESV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, @@ -208,7 +209,7 @@ * * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKOPT, LWKMIN * .. * .. External Functions .. LOGICAL LSAME @@ -229,6 +230,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -237,18 +239,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zhesvx.f b/lapack-netlib/SRC/zhesvx.f index 485c81df6..64aa16674 100644 --- a/lapack-netlib/SRC/zhesvx.f +++ b/lapack-netlib/SRC/zhesvx.f @@ -234,8 +234,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. 
LWORK >= max(1,2*N), and for best -*> performance, when FACT = 'N', LWORK >= max(1,2*N,N*NB), where +*> The length of WORK. LWORK >= MAX(1,2*N), and for best +*> performance, when FACT = 'N', LWORK >= MAX(1,2*N,N*NB), where *> NB is the optimal blocksize for ZHETRF. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -276,7 +276,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEsolve +*> \ingroup hesvx * * ===================================================================== SUBROUTINE ZHESVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B, @@ -307,7 +307,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKOPT, LWKMIN, NB DOUBLE PRECISION ANORM * .. * .. External Functions .. @@ -329,6 +329,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -346,12 +347,12 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 2*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) diff --git a/lapack-netlib/SRC/zhetrd_2stage.f b/lapack-netlib/SRC/zhetrd_2stage.f index b9d2f0eb1..ab444894b 100644 --- a/lapack-netlib/SRC/zhetrd_2stage.f +++ b/lapack-netlib/SRC/zhetrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download ZHETRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download ZHETRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * COMPLEX*16 A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. -* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. 
*> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the unitary *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is COMPLEX*16 array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is COMPLEX*16 array, dimension (LHOUS2) +*> HOUS2 is COMPLEX*16 array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -143,23 +145,26 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +182,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +207,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +215,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. 
*> International Journal of High Performance Computing Applications.
*> Volume 28 Issue 2, Pages 196-209, May 2014.
-*> http://hpc.sagepub.com/content/28/2/196 
+*> http://hpc.sagepub.com/content/28/2/196
*>
*> \endverbatim
*>
* =====================================================================
-      SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+      SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
     $                          HOUS2, LHOUS2, WORK, LWORK, INFO )
*
      IMPLICIT NONE
@@ -265,10 +270,13 @@
*
      KD     = ILAENV2STAGE( 1, 'ZHETRD_2STAGE', VECT, N, -1, -1, -1 )
      IB     = ILAENV2STAGE( 2, 'ZHETRD_2STAGE', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
-*      WRITE(*,*),'ZHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO,
-*     $            LHMIN, LWMIN
+      IF( N.EQ.0 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
+      END IF
*
      IF( .NOT.LSAME( VECT, 'N' ) ) THEN
         INFO = -1
@@ -309,14 +317,14 @@
      LWRK  = LWORK-LDAB*N
      ABPOS = 1
      WPOS  = ABPOS + LDAB*N
-      CALL ZHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, 
+      CALL ZHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB,
     $                   TAU, WORK( WPOS ), LWRK, INFO )
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'ZHETRD_HE2HB', -INFO )
         RETURN
      END IF
-      CALL ZHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, 
-     $                   WORK( ABPOS ), LDAB, D, E, 
+      CALL ZHETRD_HB2ST( 'Y', VECT, UPLO, N, KD,
+     $                   WORK( ABPOS ), LDAB, D, E,
     $                   HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO )
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'ZHETRD_HB2ST', -INFO )
@@ -324,7 +332,6 @@
      END IF
*
*
-      HOUS2( 1 ) = LHMIN
      WORK( 1 )  = LWMIN
      RETURN
*
diff --git a/lapack-netlib/SRC/zhetrd_hb2st.F b/lapack-netlib/SRC/zhetrd_hb2st.F
index 1d39ac942..247497ab6 100644
--- a/lapack-netlib/SRC/zhetrd_hb2st.F
+++ b/lapack-netlib/SRC/zhetrd_hb2st.F
@@ -18,7 +18,7 @@
*  Definition:
*  ===========
*
-*       SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+*       SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
*                                D, E, HOUS, LHOUS, WORK, LWORK, INFO )
*
* #if defined(_OPENMP)
@@ -53,12 +53,12 @@
*> \param[in] STAGE1
*> \verbatim
*>          STAGE1 is CHARACTER*1
-*>          = 'N':  "No": to mention that the stage 1 of the reduction  
+*>          = 'N':  "No": to mention that the stage 1 of the reduction
*>                  from dense to band using the zhetrd_he2hb routine
-*>                  was not called before this routine to reproduce AB. 
-*>                  In other term this routine is called as standalone.  
-*>          = 'Y':  "Yes": to mention that the stage 1 of the 
-*>                  reduction from dense to band using the zhetrd_he2hb 
+*>                  was not called before this routine to reproduce AB.
+*>                  In other words, this routine is called standalone.
+*>          = 'Y':  "Yes": to mention that the stage 1 of the
+*>                  reduction from dense to band using the zhetrd_he2hb
*>                  routine has been called to produce AB (e.g., AB is
*>                  the output of zhetrd_he2hb).
*> \endverbatim
*>
*> \param[in] VECT
*> \verbatim
*>          VECT is CHARACTER*1
-*>          = 'N':  No need for the Housholder representation, 
+*>          = 'N':  No need for the Householder representation,
*>                  and thus LHOUS is of size max(1, 4*N);
-*>          = 'V':  the Householder representation is needed to 
-*>                  either generate or to apply Q later on, 
+*>          = 'V':  the Householder representation is needed to
+*>                  either generate or to apply Q later on,
*>                  then LHOUS is to be queried and computed.
*>                  (NOT AVAILABLE IN THIS RELEASE).
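+*>                  ( A query with LHOUS = -1 returns the required
+*>                    size in HOUS(1); see LHOUS below. )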
*> \endverbatim @@ -132,34 +132,39 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is COMPLEX*16 array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is COMPLEX*16 array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error *> message related to LHOUS is issued by XERBLA. *> LHOUS = MAX(1, dimension) where *> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> not available now if VECT='H' *> \endverbatim *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension LWORK. +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)). +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -188,7 +193,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup hetrd_hb2st * *> \par Further Details: * ===================== @@ -208,7 +213,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -216,16 +221,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * @@ -259,11 +264,11 @@ * .. * .. Local Scalars .. 
      LOGICAL            LQUERY, WANTQ, UPPER, AFTERS1
-      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, 
+      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
     $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
     $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
-     $                   NBTILES, TTYPE, TID, NTHREADS, DEBUG,
-     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, 
+     $                   NBTILES, TTYPE, TID, NTHREADS,
+     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
     $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
     $                   SIZEV, SIZETAU, LDV, LHMIN, LWMIN
      DOUBLE PRECISION   ABSTMP
@@ -277,7 +282,7 @@
*     ..
*     .. External Functions ..
      LOGICAL            LSAME
-      INTEGER            ILAENV2STAGE 
+      INTEGER            ILAENV2STAGE
      EXTERNAL           LSAME, ILAENV2STAGE
*     ..
*     .. Executable Statements ..
@@ -285,7 +290,6 @@
*     Determine the minimal workspace size required.
*     Test the input parameters
*
-      DEBUG = 0
      INFO = 0
      AFTERS1 = LSAME( STAGE1, 'Y' )
      WANTQ   = LSAME( VECT, 'V' )
@@ -294,9 +298,14 @@
*
*     Determine the block size, the workspace size and the hous size.
*
-      IB     = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
+      IB     = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 )
+      IF( N.EQ.0 .OR. KD.LE.1 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
+      END IF
*
      IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
         INFO = -1
@@ -358,7 +367,7 @@
         ABDPOS   = KD + 1
         ABOFDPOS = KD
      ELSE
-         APOS     = INDA 
+         APOS     = INDA
         AWPOS    = INDA + KD + 1
         DPOS     = APOS
         OFDPOS   = DPOS + 1
@@ -366,11 +375,11 @@
         ABOFDPOS = 2
      ENDIF
-* 
-*     Case KD=0: 
-*     The matrix is diagonal. We just copy it (convert to "real" for 
-*     complex because D is double and the imaginary part should be 0) 
-*     and store it in D. A sequential code here is better or 
+*
+*     Case KD=0:
+*     The matrix is diagonal. We just copy it (convert to "real" for
+*     complex because D is double and the imaginary part should be 0)
+*     and store it in D. A sequential code here is better or
*     in a parallel environment it might need two cores for D and E
*
      IF( KD.EQ.0 ) THEN
@@ -385,17 +394,17 @@
         WORK( 1 ) = 1
         RETURN
      END IF
-* 
-*     Case KD=1: 
-*     The matrix is already Tridiagonal. We have to make diagonal 
+*
+*     Case KD=1:
+*     The matrix is already tridiagonal. We have to make diagonal
*     and offdiagonal elements real, and store them in D and E.
-*     For that, for real precision just copy the diag and offdiag 
-*     to D and E while for the COMPLEX case the bulge chasing is 
-*     performed to convert the hermetian tridiagonal to symmetric 
-*     tridiagonal. A simpler conversion formula might be used, but then 
+*     For that, for real precision just copy the diag and offdiag
+*     to D and E while for the COMPLEX case the bulge chasing is
+*     performed to convert the hermitian tridiagonal to symmetric
+*     tridiagonal. A simpler conversion formula might be used, but then
*     updating the Q matrix will be required and based if Q is generated
-*     or not this might complicate the story. 
-* 
+*     or not this might complicate the story.
+*
      IF( KD.EQ.1 ) THEN
         DO 50 I = 1, N
            D( I ) = DBLE( AB( ABDPOS, I ) )
@@ -444,7 +453,7 @@ C     END IF
         RETURN
      END IF
*
-*     Main code start here. 
+*     Main code start here.
*     Reduce the hermitian band of A to a tridiagonal matrix.
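[Editor's sketch] The !$OMP TASK DEPEND clauses in the hunks just below order the bulge-chasing kernels through dummy in/out dependencies on elements of WORK instead of barriers. A tiny self-contained illustration of that pattern follows; it is not OpenBLAS code, just the same dependency idiom (requires an OpenMP 4.0+ compiler):

      PROGRAM DEPEND_CHAIN
      IMPLICIT NONE
      INTEGER :: SLOT( 0:3 ), I
      SLOT = 0
!$OMP PARALLEL
!$OMP MASTER
      DO I = 1, 3
!        A task may start only after its predecessor wrote SLOT(I-1),
!        mirroring DEPEND(in:WORK(MYID-1)) / DEPEND(out:WORK(MYID)).
!$OMP TASK DEPEND(in:SLOT(I-1)) DEPEND(out:SLOT(I)) FIRSTPRIVATE(I)
         SLOT( I ) = SLOT( I-1 ) + 1
!$OMP END TASK
      END DO
!$OMP END MASTER
!$OMP END PARALLEL
!     Prints 1 2 3 regardless of the number of threads.
      PRINT *, SLOT( 1:3 )
      END PROGRAM DEPEND_CHAIN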
* THGRSIZ = N @@ -453,7 +462,7 @@ C END IF NBTILES = CEILING( REAL(N)/REAL(KD) ) STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) -* +* CALL ZLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) CALL ZLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) * @@ -462,7 +471,7 @@ C END IF * #if defined(_OPENMP) !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) -!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) !$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) !$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) !$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) @@ -471,7 +480,7 @@ C END IF #endif * * main bulge chasing loop -* +* DO 100 THGRID = 1, THGRNB STT = (THGRID-1)*THGRSIZ+1 THED = MIN( (STT + THGRSIZ -1), (N-1)) @@ -482,7 +491,7 @@ C END IF ST = STT DO 130 SWEEPID = ST, ED DO 140 K = 1, GRSIZ - MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) $ + (M-1)*GRSIZ + K IF ( MYID.EQ.1 ) THEN TTYPE = 1 @@ -508,17 +517,17 @@ C END IF ENDIF * * Call the kernel -* +* #if defined(_OPENMP) && _OPENMP >= 201307 - IF( TTYPE.NE.1 ) THEN + IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK @@ -526,20 +535,20 @@ C END IF !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK ENDIF #else - CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW ) ) -#endif +#endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 EXIT @@ -554,14 +563,14 @@ C END IF !$OMP END MASTER !$OMP END PARALLEL #endif -* +* * Copy the diagonal from A to D. Note that D is REAL thus only * the Real part is needed, the imaginary part should be zero. * DO 150 I = 1, N D( I ) = DBLE( WORK( DPOS+(I-1)*LDA ) ) 150 CONTINUE -* +* * Copy the off diagonal from A to E. Note that E is REAL thus only * the Real part is needed, the imaginary part should be zero. * @@ -575,11 +584,10 @@ C END IF 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN WORK( 1 ) = LWMIN RETURN * * End of ZHETRD_HB2ST * END - + diff --git a/lapack-netlib/SRC/zhetrd_he2hb.f b/lapack-netlib/SRC/zhetrd_he2hb.f index e1b2e1794..3e3bfa374 100644 --- a/lapack-netlib/SRC/zhetrd_he2hb.f +++ b/lapack-netlib/SRC/zhetrd_he2hb.f @@ -123,8 +123,8 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY). 
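[Editor's note] For orientation, a compact restatement (not text from the patch) of the two-stage reduction that ZHETRD_HE2HB and ZHETRD_HB2ST implement:

\[
A = Q_1 B Q_1^H, \qquad B = Q_2 T Q_2^H
\quad\Longrightarrow\quad
A = (Q_1 Q_2)\, T \,(Q_1 Q_2)^H ,
\]

where A is the dense Hermitian input, B is Hermitian band with bandwidth KD (stage 1), and T is the real symmetric tridiagonal matrix returned in D and E (stage 2). When N <= KD+1 the input already fits inside the band, which is why the hunk above lets stage 1 accept LWORK = 1.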
+*>
*>          If LWORK = -1, then a workspace query is assumed; the routine
*>          only calculates the optimal size of the WORK array, returns
*>          this value as the first entry of the WORK array, and no error
@@ -158,7 +160,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrd_he2hb
*
*> \par Further Details:
*  =====================
@@ -293,8 +295,12 @@
      INFO   = 0
      UPPER  = LSAME( UPLO, 'U' )
      LQUERY = ( LWORK.EQ.-1 )
-      LWMIN  = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 )
-
+      IF( N.LE.KD+1 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 )
+      END IF
+*
      IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
         INFO = -1
      ELSE IF( N.LT.0 ) THEN
         INFO = -2
diff --git a/lapack-netlib/SRC/zhetrf.f b/lapack-netlib/SRC/zhetrf.f
index 78d4f71b8..433887108 100644
--- a/lapack-netlib/SRC/zhetrf.f
+++ b/lapack-netlib/SRC/zhetrf.f
@@ -107,7 +107,7 @@
*> \param[in] LWORK
*> \verbatim
*>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
*>          LWORK >= N*NB, where NB is the block size returned by ILAENV.
*> \endverbatim
*>
@@ -130,7 +130,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrf
*
*> \par Further Details:
*  =====================
@@ -227,7 +227,7 @@
*        Determine the block size
*
         NB = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
         WORK( 1 ) = LWKOPT
      END IF
*
@@ -346,6 +346,7 @@
      END IF
*
   40 CONTINUE
+*
      WORK( 1 ) = LWKOPT
      RETURN
*
diff --git a/lapack-netlib/SRC/zhetrf_aa.f b/lapack-netlib/SRC/zhetrf_aa.f
index 537c16e8c..381c87d51 100644
--- a/lapack-netlib/SRC/zhetrf_aa.f
+++ b/lapack-netlib/SRC/zhetrf_aa.f
@@ -101,8 +101,10 @@
*> \param[in] LWORK
*> \verbatim
*>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >= MAX(1,2*N). For optimum performance
-*>          LWORK >= N*(1+NB), where NB is the optimal blocksize.
+*>          The length of WORK.
+*>          LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise.
+*>          For optimum performance LWORK >= N*(1+NB), where NB is
+*>          the optimal blocksize, returned by ILAENV.
*>
*>          If LWORK = -1, then a workspace query is assumed; the routine
*>          only calculates the optimal size of the WORK array, returns
@@ -125,10 +127,10 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrf_aa
*
*  =====================================================================
-      SUBROUTINE ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO)
+      SUBROUTINE ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
*
*  -- LAPACK computational routine --
*  -- LAPACK is a software package provided by Univ. of Tennessee, --
@@ -152,7 +154,7 @@
*
*     .. Local Scalars ..
      LOGICAL            LQUERY, UPPER
-      INTEGER            J, LWKOPT
+      INTEGER            J, LWKMIN, LWKOPT
      INTEGER            NB, MJ, NJ, K1, K2, J1, J2, J3, JB
      COMPLEX*16         ALPHA
*     ..
@@ -178,18 +180,25 @@
      INFO = 0
      UPPER = LSAME( UPLO, 'U' )
      LQUERY = ( LWORK.EQ.-1 )
+      IF( N.LE.1 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = 2*N
+         LWKOPT = (NB+1)*N
+      END IF
+*
      IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
         INFO = -1
      ELSE IF( N.LT.0 ) THEN
         INFO = -2
      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
         INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N WORK( 1 ) = LWKOPT END IF * @@ -202,11 +211,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN A( 1, 1 ) = DBLE( A( 1, 1 ) ) RETURN END IF diff --git a/lapack-netlib/SRC/zhetrf_aa_2stage.f b/lapack-netlib/SRC/zhetrf_aa_2stage.f index 477602b5e..bab13a99d 100644 --- a/lapack-netlib/SRC/zhetrf_aa_2stage.f +++ b/lapack-netlib/SRC/zhetrf_aa_2stage.f @@ -87,14 +87,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX*16 array, dimension (LTB) +*> TB is COMPLEX*16 array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 workspace of size LWORK +*> WORK is COMPLEX*16 workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16SYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -182,7 +182,7 @@ * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY INTEGER I, J, K, I1, I2, TD - INTEGER LDTB, NB, KB, JB, NT, IINFO + INTEGER LWKOPT, LDTB, NB, KB, JB, NT, IINFO COMPLEX*16 PIV * .. * .. External Functions .. @@ -212,9 +212,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -228,10 +228,10 @@ NB = ILAENV( 1, 'ZHETRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = MAX( 1, (3*NB+1)*N ) END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = MAX( 1, N*NB ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -240,7 +240,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * @@ -392,7 +392,7 @@ CALL ZGETRF( N-(J+1)*NB, NB, $ WORK, N, $ IPIV( (J+1)*NB+1 ), IINFO ) -c IF (IINFO.NE.0 .AND. INFO.EQ.0) THEN +c IF( IINFO.NE.0 .AND. INFO.EQ.0 ) THEN c INFO = IINFO+(J+1)*NB c END IF * @@ -587,7 +587,7 @@ c END IF CALL ZGETRF( N-(J+1)*NB, NB, $ A( (J+1)*NB+1, J*NB+1 ), LDA, $ IPIV( (J+1)*NB+1 ), IINFO ) -c IF (IINFO.NE.0 .AND. INFO.EQ.0) THEN +c IF( IINFO.NE.0 .AND. INFO.EQ.0 ) THEN c INFO = IINFO+(J+1)*NB c END IF * diff --git a/lapack-netlib/SRC/zhetrf_rk.f b/lapack-netlib/SRC/zhetrf_rk.f index 73dd9f9d0..7c505fa4d 100644 --- a/lapack-netlib/SRC/zhetrf_rk.f +++ b/lapack-netlib/SRC/zhetrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension ( MAX(1,LWORK) ). 
+*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -229,7 +229,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf_rk * *> \par Further Details: * ===================== @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'ZHETRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/zhetrf_rook.f b/lapack-netlib/SRC/zhetrf_rook.f index e9de47248..a56349092 100644 --- a/lapack-netlib/SRC/zhetrf_rook.f +++ b/lapack-netlib/SRC/zhetrf_rook.f @@ -122,7 +122,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -150,7 +150,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf_rook * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/zhetri2.f b/lapack-netlib/SRC/zhetri2.f index 384745c3a..1d932b866 100644 --- a/lapack-netlib/SRC/zhetri2.f +++ b/lapack-netlib/SRC/zhetri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (N+NB+1)*(NB+3) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)). *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE ZHETRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -159,9 +159,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +177,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = MINSIZE RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - IF( NBMAX .GE. 
N ) THEN + IF( NBMAX.GE.N ) THEN CALL ZHETRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL ZHETRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of ZHETRI2 diff --git a/lapack-netlib/SRC/zhetrs_aa.f b/lapack-netlib/SRC/zhetrs_aa.f index 06ac1fd28..b7a1f7f07 100644 --- a/lapack-netlib/SRC/zhetrs_aa.f +++ b/lapack-netlib/SRC/zhetrs_aa.f @@ -106,7 +106,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -124,7 +130,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrs_aa * * ===================================================================== SUBROUTINE ZHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, @@ -152,7 +158,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME @@ -162,13 +168,19 @@ EXTERNAL ZGTSV, ZSWAP, ZTRSM, ZLACGV, ZLACPY, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -179,21 +191,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKMIN RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/zlamswlq.f b/lapack-netlib/SRC/zlamswlq.f index 4abefa434..59a0a5558 100644 --- a/lapack-netlib/SRC/zlamswlq.f +++ b/lapack-netlib/SRC/zlamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -189,92 +192,103 @@ *> SIAM J. Sci. Comput, vol. 34, no. 
1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE ZLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX*16 A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. External Subroutines .. - EXTERNAL ZTPMLQT, ZGEMLQT, XERBLA + EXTERNAL ZTPMLQT, ZGEMLQT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLAMSWLQ', -INFO ) - WORK(1) = LW RETURN - ELSE IF (LQUERY) THEN - WORK(1) = LW + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * IF((NB.LE.K).OR.(NB.GE.MAX(M,N,K))) THEN CALL ZGEMLQT( SIDE, TRANS, M, N, K, MB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN END IF * @@ -403,7 +417,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN RETURN * * End of ZLAMSWLQ diff --git a/lapack-netlib/SRC/zlamtsqr.f b/lapack-netlib/SRC/zlamtsqr.f index 5030cb75f..03770c06e 100644 --- a/lapack-netlib/SRC/zlamtsqr.f +++ b/lapack-netlib/SRC/zlamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). 
*> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -191,46 +193,50 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE ZLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX*16 A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. External Subroutines .. - EXTERNAL ZGEMQRT, ZTPMQRT, XERBLA + EXTERNAL ZGEMQRT, ZTPMQRT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -238,11 +244,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -253,38 +265,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. 
(.NOT.LQUERY) ) THEN
         INFO = -15
      END IF
*
-*     Determine the block size if it is tall skinny or short and wide
-*
-      IF( INFO.EQ.0) THEN
-        WORK(1) = LW
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = LWMIN
      END IF
*
      IF( INFO.NE.0 ) THEN
        CALL XERBLA( 'ZLAMTSQR', -INFO )
        RETURN
-      ELSE IF (LQUERY) THEN
-        RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
      END IF
*
*     Quick return if possible
*
-      IF( MIN(M,N,K).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
        RETURN
      END IF
+*
+*     Determine the block size if it is tall skinny or short and wide
*
      IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN
        CALL ZGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA,
-     $                T, LDT, C, LDC, WORK, INFO)
+     $                T, LDT, C, LDC, WORK, INFO )
        RETURN
-      END IF 
+      END IF
*
      IF(LEFT.AND.NOTRAN) THEN
*
@@ -410,7 +422,7 @@
*
      END IF
*
-      WORK(1) = LW
+      WORK( 1 ) = LWMIN
      RETURN
*
*     End of ZLAMTSQR
diff --git a/lapack-netlib/SRC/zlaqp2rk.c b/lapack-netlib/SRC/zlaqp2rk.c
new file mode 100644
index 000000000..0d38e71fb
--- /dev/null
+++ b/lapack-netlib/SRC/zlaqp2rk.c
@@ -0,0 +1,947 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{   flag cierr;
+    ftnint ciunit;
+    flag ciend;
+    char *cifmt;
+    ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{   flag icierr;
+    char *iciunit;
+    flag iciend;
+    char *icifmt;
+    ftnint icirlen;
+    ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{   flag oerr;
+    ftnint ounit;
+    char *ofnm;
+    ftnlen ofnmlen;
+    char *osta;
+    char *oacc;
+    char *ofm;
+    ftnint orl;
+    char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{   flag cerr;
+    ftnint cunit;
+    char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{   flag aerr;
+    ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{   flag inerr;
+    ftnint inunit;
+    char *infile;
+    ftnlen infilen;
+    ftnint  *inex;  /*parameters in standard's order*/
+
ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + *info = *n + kk - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL >= ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL >= ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + +/* Set K, the number of factorized columns. */ + + *k = kk - 1; + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. */ + + i__2 = minmnfact; + for (j = kk; j <= i__2; ++j) { + i__3 = j; + tau[i__3].r = 0., tau[i__3].i = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,KK:N): */ +/* 1) swap the KK-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) copy the KK-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than KK in the next loop step.) */ +/* 3) Save the pivot interchange with the indices relative to the */ +/* the original matrix A, not the block A(1:M,1:N). */ + + if (kp != kk) { + zswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1); + vn1[kp] = vn1[kk]; + vn2[kp] = vn2[kk]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[kk]; + jpiv[kk] = itemp; + } + +/* Generate elementary reflector H(KK) using the column A(I:M,KK), */ +/* if the column has more than one element, otherwise */ +/* the elementary reflector would be an identity matrix, */ +/* and TAU(KK) = CZERO. 
*/ + + if (i__ < *m) { + i__2 = *m - i__ + 1; + zlarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], & + c__1, &tau[kk]); + } else { + i__2 = kk; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + +/* Check if TAU(KK) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(KK) for Inf, */ +/* since ZLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by ZLARFG can contain Inf, which requires */ +/* TAU(KK) to contain NaN. Therefore, this case of generating Inf */ +/* by ZLARFG is covered by checking TAU(KK) for NaN. */ + + i__2 = kk; + d__1 = tau[i__2].r; + if (disnan_(&d__1)) { + i__2 = kk; + taunan = tau[i__2].r; + } else /* if(complicated condition) */ { + d__1 = d_imag(&tau[kk]); + if (disnan_(&d__1)) { + taunan = d_imag(&tau[kk]); + } else { + taunan = 0.; + } + } + + if (disnan_(&taunan)) { + *k = kk - 1; + *info = kk; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = taunan; + *relmaxc2nrmk = taunan; + +/* Array TAU(KK:MINMNFACT) is not set and contains */ +/* undefined elements, except the first element TAU(KK) = NaN. */ + + return 0; + } + +/* Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. */ +/* ( If M >= N, then at KK = N there is no residual matrix, */ +/* i.e. no columns of A to update, only columns of B. */ +/* If M < N, then at KK = M-IOFFSET, I = M and we have a */ +/* one-row residual matrix in A and the elementary */ +/* reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update */ +/* is needed for the residual matrix in A and the */ +/* right-hand-side-matrix in B. */ +/* Therefore, we update only if */ +/* KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */ +/* condition is satisfied, not only KK < N+NRHS ) */ + + if (kk < minmnupdt) { + i__2 = i__ + kk * a_dim1; + aikk.r = a[i__2].r, aikk.i = a[i__2].i; + i__2 = i__ + kk * a_dim1; + a[i__2].r = 1., a[i__2].i = 0.; + i__2 = *m - i__ + 1; + i__3 = *n + *nrhs - kk; + d_cnjg(&z__1, &tau[kk]); + zlarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &z__1, + &a[i__ + (kk + 1) * a_dim1], lda, &work[1]); + i__2 = i__ + kk * a_dim1; + a[i__2].r = aikk.r, a[i__2].i = aikk.i; + } + + if (kk < minmnfact) { + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */ +/* when KK < f2cmin(M-IOFFSET, N). */ + + i__2 = *n; + for (j = kk + 1; j <= i__2; ++j) { + if (vn1[j] != 0.) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + +/* Computing 2nd power */ + d__1 = z_abs(&a[i__ + j * a_dim1]) / vn1[j]; + temp = 1. - d__1 * d__1; + temp = f2cmax(temp,0.); +/* Computing 2nd power */ + d__1 = vn1[j] / vn2[j]; + temp2 = temp * (d__1 * d__1); + if (temp2 <= tol3z) { + +/* Compute the column 2-norm for the partial */ +/* column A(I+1:M,J) by explicitly computing it, */ +/* and store it in both partial 2-norm vector VN1 */ +/* and exact column 2-norm vector VN2. */ + + i__3 = *m - i__; + vn1[j] = dznrm2_(&i__3, &a[i__ + 1 + j * a_dim1], & + c__1); + vn2[j] = vn1[j]; + + } else { + +/* Update the column 2-norm for the partial */ +/* column A(I+1:M,J) by removing one */ +/* element A(I,J) and store it in partial */ +/* 2-norm vector VN1. */ + + vn1[j] *= sqrt(temp); + + } + } + } + + } + +/* End factorization loop */ + + } + +/* If we reached this point, all colunms have been factorized, */ +/* i.e. 
no condition was triggered to exit the routine. */ +/* Set the number of factorized columns. */ + + *k = *kmax; + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */ +/* we return. */ + + if (*k < minmnfact) { + + i__1 = *n - *k; + jmaxc2nrm = *k + idamax_(&i__1, &vn1[*k + 1], &c__1); + *maxc2nrmk = vn1[jmaxc2nrm]; + + if (*k == 0) { + *relmaxc2nrmk = 1.; + } else { + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + } + + } else { + *maxc2nrmk = 0.; + *relmaxc2nrmk = 0.; + } + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, set TAUs corresponding to the columns that were */ +/* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO. */ + + i__1 = minmnfact; + for (j = *k + 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + + return 0; + +/* End of ZLAQP2RK */ + +} /* zlaqp2rk_ */ + diff --git a/lapack-netlib/SRC/zlaqp2rk.f b/lapack-netlib/SRC/zlaqp2rk.f new file mode 100644 index 000000000..f6bf555c2 --- /dev/null +++ b/lapack-netlib/SRC/zlaqp2rk.f @@ -0,0 +1,726 @@ +*> \brief \b ZLAQP2RK computes truncated QR factorization with column pivoting of a complex matrix block using Level 2 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZLAQP2RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, +* $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, +* $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, +* $ INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER JPIV( * ) +* DOUBLE PRECISION VN1( * ), VN2( * ) +* COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) +* $ +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLAQP2RK computes a truncated (rank K) or full rank Householder QR +*> factorization with column pivoting of the complex matrix +*> block A(IOFFSET+1:M,1:N) as +*> +*> A * P(K) = Q(K) * R(K). +*> +*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N) +*> is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides matrix block B +*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**H * B. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. 
+*>
+*>          The maximum number of columns of the matrix A to factorize,
+*>          i.e. the maximum factorization rank.
+*>
+*>          a) If KMAX >= min(M-IOFFSET,N), then this stopping
+*>                criterion is not used, factorize columns
+*>                depending on ABSTOL and RELTOL.
+*>
+*>          b) If KMAX = 0, then this stopping criterion is
+*>                satisfied on input and the routine exits immediately.
+*>                This means that the factorization is not performed,
+*>                the matrices A and B and the arrays TAU, JPIV
+*>                are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The second factorization stopping criterion.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The third factorization stopping criterion.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          b) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine ZGEQP3RK. 1 <= KP1 <= N_orig_mat.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is DOUBLE PRECISION
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine ZGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:K) below
+*>             the diagonal together with the array TAU represent
+*>             the unitary matrix Q(K) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:K) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,K+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(K)**H.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX*16 array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is DOUBLE PRECISION array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is DOUBLE PRECISION array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (N-1) +*> Used in ZLARF subroutine to apply an elementary +*> reflector from the left. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp2rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. 
+*>
+*> \htmlonly
+*> https://doi.org/10.1137/S1064827595296732
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> https://doi.org/10.1145/1377612.1377616
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE ZLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+     $                     INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee, --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            JPIV( * )
+      DOUBLE PRECISION   VN1( * ), VN2( * )
+      COMPLEX*16         A( LDA, * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+      COMPLEX*16         CZERO, CONE
+      PARAMETER          ( CZERO = ( 0.0D+0, 0.0D+0 ),
+     $                     CONE  = ( 1.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT,
+     $                   MINMNUPDT
+      DOUBLE PRECISION   HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z
+      COMPLEX*16         AIKK
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZLARF, ZLARFG, ZSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DBLE, DCONJG, DIMAG, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            DISNAN
+      INTEGER            IDAMAX
+      DOUBLE PRECISION   DLAMCH, DZNRM2
+      EXTERNAL           DISNAN, DLAMCH, IDAMAX, DZNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT is the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+*     MINMNUPDT is the smallest dimension
+*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be updated, which
+*     contains the submatrices A(IOFFSET+1:M,1:N) and
+*     B(IOFFSET+1:M,1:NRHS) as column blocks.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      KMAX = MIN( KMAX, MINMNFACT )
+      TOL3Z = SQRT( DLAMCH( 'Epsilon' ) )
+      HUGEVAL = DLAMCH( 'Overflow' )
+*
+*     Compute the factorization, KK is the column loop index.
+*
+      DO KK = 1, KMAX
+*
+         I = IOFFSET + KK
+*
+         IF( I.EQ.1 ) THEN
+*
+*           ============================================================
+*
+*           We are at the first column of the original whole matrix A,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+            KP = KP1
+*
+*           ============================================================
+*
+         ELSE
+*
+*           ============================================================
+*
+*           Determine the pivot column in KK-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,KK:N).
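[Editor's note] The pivot search that follows scans the partial column norms kept in VN1. For reference, the downdating rule those norms obey (LAWN 176, reference [2] above; the corresponding code appears near the end of the factorization loop below) removes one row per step:

\[
\|a_j^{(I+1)}\|_2^{2} \;=\; \|a_j^{(I)}\|_2^{2} \;-\; |A(I,j)|^{2}
\quad\Longrightarrow\quad
\mathrm{VN1}(j) \;\leftarrow\; \mathrm{VN1}(j)\,
\sqrt{\max\!\Big(0,\; 1-\big(|A(I,j)|/\mathrm{VN1}(j)\big)^{2}\Big)} ,
\]

and the norm is recomputed exactly with DZNRM2 whenever the cancellation guard, measured against the last exactly computed value VN2(j), satisfies \( \big(1-(|A(I,j)|/\mathrm{VN1}(j))^{2}\big)\big(\mathrm{VN1}(j)/\mathrm{VN2}(j)\big)^{2} \le \mathrm{TOL3Z} = \sqrt{\varepsilon} \).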
+*
+            KP = ( KK-1 ) + IDAMAX( N-KK+1, VN1( KK ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,KK:N) in step KK.
+*           RELMAXC2NRMK will be computed later, after some condition
+*           checks on MAXC2NRMK.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains NaN, and set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( DISNAN( MAXC2NRMK ) ) THEN
+*
+*              Set K, the number of factorized columns
+*              that are not zero.
+*
+               K = KK - 1
+               INFO = K + KP
+*
+*              Set RELMAXC2NRMK to NaN.
+*
+               RELMAXC2NRMK = MAXC2NRMK
+*
+*              Array TAU(K+1:MINMNFACT) is not set and contains
+*              undefined elements.
+*
+               RETURN
+            END IF
+*
+*           ============================================================
+*
+*           Quick return, if the submatrix A(I:M,KK:N) is
+*           a zero matrix.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+*              Set K, the number of factorized columns
+*              that are not zero.
+*
+               K = KK - 1
+               RELMAXC2NRMK = ZERO
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + KK - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third stopping criteria.
+*           NOTE: There is no need to test for ABSTOL >= ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL >= ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+
+            RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+*              Set K, the number of factorized columns.
+*
+               K = KK - 1
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,KK:N): +* 1) swap the KK-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) copy the KK-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than KK in the next loop step.) +* 3) Save the pivot interchange with the indices relative to the +* the original matrix A, not the block A(1:M,1:N). +* + IF( KP.NE.KK ) THEN + CALL ZSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 ) + VN1( KP ) = VN1( KK ) + VN2( KP ) = VN2( KK ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( KK ) + JPIV( KK ) = ITEMP + END IF +* +* Generate elementary reflector H(KK) using the column A(I:M,KK), +* if the column has more than one element, otherwise +* the elementary reflector would be an identity matrix, +* and TAU(KK) = CZERO. +* + IF( I.LT.M ) THEN + CALL ZLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1, + $ TAU( KK ) ) + ELSE + TAU( KK ) = CZERO + END IF +* +* Check if TAU(KK) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(KK) for Inf, +* since ZLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by ZLARFG can contain Inf, which requires +* TAU(KK) to contain NaN. Therefore, this case of generating Inf +* by ZLARFG is covered by checking TAU(KK) for NaN. +* + IF( DISNAN( DBLE( TAU(KK) ) ) ) THEN + TAUNAN = DBLE( TAU(KK) ) + ELSE IF( DISNAN( DIMAG( TAU(KK) ) ) ) THEN + TAUNAN = DIMAG( TAU(KK) ) + ELSE + TAUNAN = ZERO + END IF +* + IF( DISNAN( TAUNAN ) ) THEN + K = KK - 1 + INFO = KK +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAUNAN + RELMAXC2NRMK = TAUNAN +* +* Array TAU(KK:MINMNFACT) is not set and contains +* undefined elements, except the first element TAU(KK) = NaN. +* + RETURN + END IF +* +* Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. +* ( If M >= N, then at KK = N there is no residual matrix, +* i.e. no columns of A to update, only columns of B. +* If M < N, then at KK = M-IOFFSET, I = M and we have a +* one-row residual matrix in A and the elementary +* reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update +* is needed for the residual matrix in A and the +* right-hand-side-matrix in B. +* Therefore, we update only if +* KK < MINMNUPDT = min(M-IOFFSET, N+NRHS) +* condition is satisfied, not only KK < N+NRHS ) +* + IF( KK.LT.MINMNUPDT ) THEN + AIKK = A( I, KK ) + A( I, KK ) = CONE + CALL ZLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1, + $ DCONJG( TAU( KK ) ), A( I, KK+1 ), LDA, + $ WORK( 1 ) ) + A( I, KK ) = AIKK + END IF +* + IF( KK.LT.MINMNFACT ) THEN +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. +* when KK < min(M-IOFFSET, N). +* + DO J = KK + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2 + TEMP = MAX( TEMP, ZERO ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2 .LE. 
TOL3Z ) THEN
+*
+*                 Compute the column 2-norm for the partial
+*                 column A(I+1:M,J) by explicitly computing it,
+*                 and store it in both partial 2-norm vector VN1
+*                 and exact column 2-norm vector VN2.
+*
+                  VN1( J ) = DZNRM2( M-I, A( I+1, J ), 1 )
+                  VN2( J ) = VN1( J )
+*
+               ELSE
+*
+*                 Update the column 2-norm for the partial
+*                 column A(I+1:M,J) by removing one
+*                 element A(I,J) and store it in partial
+*                 2-norm vector VN1.
+*
+                  VN1( J ) = VN1( J )*SQRT( TEMP )
+*
+               END IF
+            END IF
+         END DO
+*
+      END IF
+*
+*     End factorization loop
+*
+      END DO
+*
+*     If we reached this point, all columns have been factorized,
+*     i.e. no condition was triggered to exit the routine.
+*     Set the number of factorized columns.
+*
+      K = KMAX
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before
+*     we return.
+*
+      IF( K.LT.MINMNFACT ) THEN
+*
+         JMAXC2NRM = K + IDAMAX( N-K, VN1( K+1 ), 1 )
+         MAXC2NRMK = VN1( JMAXC2NRM )
+*
+         IF( K.EQ.0 ) THEN
+            RELMAXC2NRMK = ONE
+         ELSE
+            RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+         END IF
+*
+      ELSE
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+      END IF
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, set TAUs corresponding to the columns that were
+*     not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO.
+*
+      DO J = K + 1, MINMNFACT
+         TAU( J ) = CZERO
+      END DO
+*
+      RETURN
+*
+*     End of ZLAQP2RK
+*
+      END
diff --git a/lapack-netlib/SRC/zlaqp3rk.c b/lapack-netlib/SRC/zlaqp3rk.c
new file mode 100644
index 000000000..cb44e4d34
--- /dev/null
+++ b/lapack-netlib/SRC/zlaqp3rk.c
@@ -0,0 +1,1157 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{ flag cierr;
+  ftnint ciunit;
+  flag ciend;
+  char *cifmt;
+  ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? 
((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static 
_Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, + kb, &z__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* Quick return, if the submatrix A(I:M,K:N) is */ +/* a zero matrix. We need to check it only if the column index */ +/* (same as row index) is larger than 1, since the condition */ +/* for the whole original matrix A_orig is checked in the main */ +/* routine. */ + + if (*maxc2nrmk == 0.) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *relmaxc2nrmk = 0.; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix is zero and we stop the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, + kb, &z__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. 
*/ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* Check if the submatrix A(I:M,K:N) contains Inf, */ +/* set INFO parameter to the column number, where */ +/* the first Inf is found plus N, and continue */ +/* the computation. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + if (*info == 0 && *maxc2nrmk > myhugeval) { + *info = *n + k - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third tolerance stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL.GE.ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig; */ + + *kb = k - 1; + if__ = i__ - 1; + +/* Apply the block reflector to the residual of the */ +/* matrix A and the residual of the right hand sides B, if */ +/* the residual matrix and and/or the residual of the right */ +/* hand sides exist, i.e. if the submatrix */ +/* A(I+1:M,KB+1:N+NRHS) exists. This occurs when */ +/* KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, & + i__2, kb, &z__1, &a[if__ + 1 + a_dim1], lda, &f[* + kb + 1 + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*kb + + 1) * a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,K:N): */ +/* 1) swap the K-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */ +/* 3) copy the K-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. 
(Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than K in the next loop step.) */ +/* 4) Save the pivot interchange with the indices relative to the */ +/* the original matrix A_orig, not the block A(1:M,1:N). */ + + if (kp != k) { + zswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + zswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf); + vn1[kp] = vn1[k]; + vn2[kp] = vn2[k]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[k]; + jpiv[k] = itemp; + } + +/* Apply previous Householder reflectors to column K: */ +/* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. */ + + if (k > 1) { + i__1 = k - 1; + for (j = 1; j <= i__1; ++j) { + i__2 = k + j * f_dim1; + d_cnjg(&z__1, &f[k + j * f_dim1]); + f[i__2].r = z__1.r, f[i__2].i = z__1.i; + } + i__1 = *m - i__ + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = 0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[i__ + a_dim1], lda, + &f[k + f_dim1], ldf, &c_b2, &a[i__ + k * a_dim1], &c__1); + i__1 = k - 1; + for (j = 1; j <= i__1; ++j) { + i__2 = k + j * f_dim1; + d_cnjg(&z__1, &f[k + j * f_dim1]); + f[i__2].r = z__1.r, f[i__2].i = z__1.i; + } + } + +/* Generate elementary reflector H(k) using the column A(I:M,K). */ + + if (i__ < *m) { + i__1 = *m - i__ + 1; + zlarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], & + c__1, &tau[k]); + } else { + i__1 = k; + tau[i__1].r = 0., tau[i__1].i = 0.; + } + +/* Check if TAU(K) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(K) for Inf, */ +/* since ZLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by ZLARFG can contain Inf, which requires */ +/* TAU(K) to contain NaN. Therefore, this case of generating Inf */ +/* by ZLARFG is covered by checking TAU(K) for NaN. */ + + i__1 = k; + d__1 = tau[i__1].r; + if (disnan_(&d__1)) { + i__1 = k; + taunan = tau[i__1].r; + } else /* if(complicated condition) */ { + d__1 = d_imag(&tau[k]); + if (disnan_(&d__1)) { + taunan = d_imag(&tau[k]); + } else { + taunan = 0.; + } + } + + if (disnan_(&taunan)) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *info = k; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = taunan; + *relmaxc2nrmk = taunan; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix contains NaN and we stop */ +/* the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. 
*/ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, kb, + &z__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * a_dim1], + lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* =============================================================== */ + + i__1 = i__ + k * a_dim1; + aik.r = a[i__1].r, aik.i = a[i__1].i; + i__1 = i__ + k * a_dim1; + a[i__1].r = 1., a[i__1].i = 0.; + +/* =============================================================== */ + +/* Compute the current K-th column of F: */ +/* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). */ + + if (k < *n + *nrhs) { + i__1 = *m - i__ + 1; + i__2 = *n + *nrhs - k; + zgemv_("Conjugate transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + + 1) * a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, & + f[k + 1 + k * f_dim1], &c__1); + } + +/* 2) Zero out elements above and on the diagonal of the */ +/* column K in matrix F, i.e elements F(1:K,K). */ + + i__1 = k; + for (j = 1; j <= i__1; ++j) { + i__2 = j + k * f_dim1; + f[i__2].r = 0., f[i__2].i = 0.; + } + +/* 3) Incremental updating of the K-th column of F: */ +/* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H */ +/* * A(I:M,K). */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + i__3 = k; + z__1.r = -tau[i__3].r, z__1.i = -tau[i__3].i; + zgemv_("Conjugate Transpose", &i__1, &i__2, &z__1, &a[i__ + + a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, &auxv[1] + , &c__1); + + i__1 = *n + *nrhs; + i__2 = k - 1; + zgemv_("No transpose", &i__1, &i__2, &c_b2, &f[f_dim1 + 1], ldf, & + auxv[1], &c__1, &c_b2, &f[k * f_dim1 + 1], &c__1); + } + +/* =============================================================== */ + +/* Update the current I-th row of A: */ +/* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */ +/* - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. */ + + if (k < *n + *nrhs) { + i__1 = *n + *nrhs - k; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &c__1, &i__1, &k, & + z__1, &a[i__ + a_dim1], lda, &f[k + 1 + f_dim1], ldf, & + c_b2, &a[i__ + (k + 1) * a_dim1], lda); + } + + i__1 = i__ + k * a_dim1; + a[i__1].r = aik.r, a[i__1].i = aik.i; + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */ +/* when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */ + + if (k < minmnfact) { + + i__1 = *n; + for (j = k + 1; j <= i__1; ++j) { + if (vn1[j] != 0.) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + + temp = z_abs(&a[i__ + j * a_dim1]) / vn1[j]; +/* Computing MAX */ + d__1 = 0., d__2 = (temp + 1.) * (1. - temp); + temp = f2cmax(d__1,d__2); +/* Computing 2nd power */ + d__1 = vn1[j] / vn2[j]; + temp2 = temp * (d__1 * d__1); + if (temp2 <= tol3z) { + +/* At J-index, we have a difficult column for the */ +/* update of the 2-norm. Save the index of the previous */ +/* difficult column in IWORK(J-1). */ +/* NOTE: ILSTCC > 1, threfore we can use IWORK only */ +/* with N-1 elements, where the elements are */ +/* shifted by 1 to the left. */ + + iwork[j - 1] = lsticc; + +/* Set the index of the last difficult column LSTICC. 
*/ + + lsticc = j; + + } else { + vn1[j] *= sqrt(temp); + } + } + } + + } + +/* End of while loop. */ + + } + +/* Now, afler the loop: */ +/* Set KB, the number of factorized columns in the block; */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig, IF = IOFFSET + KB. */ + + *kb = k; + if__ = i__; + +/* Apply the block reflector to the residual of the matrix A */ +/* and the residual of the right hand sides B, if the residual */ +/* matrix and and/or the residual of the right hand sides */ +/* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */ +/* This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, &i__2, kb, &z__1, + &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b2, + &a[if__ + 1 + (*kb + 1) * a_dim1], lda); + } + +/* Recompute the 2-norm of the difficult columns. */ +/* Loop over the index of the difficult columns from the largest */ +/* to the smallest index. */ + + while(lsticc > 0) { + +/* LSTICC is the index of the last difficult column is greater */ +/* than 1. */ +/* ITEMP is the index of the previous difficult column. */ + + itemp = iwork[lsticc - 1]; + +/* Compute the 2-norm explicilty for the last difficult column and */ +/* save it in the partial and exact 2-norm vectors VN1 and VN2. */ + +/* NOTE: The computation of VN1( LSTICC ) relies on the fact that */ +/* DZNRM2 does not fail on vectors with norm below the value of */ +/* SQRT(DLAMCH('S')) */ + + i__1 = *m - if__; + vn1[lsticc] = dznrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1); + vn2[lsticc] = vn1[lsticc]; + +/* Downdate the index of the last difficult column to */ +/* the index of the previous difficult column. */ + + lsticc = itemp; + + } + + return 0; + +/* End of ZLAQP3RK */ + +} /* zlaqp3rk_ */ + diff --git a/lapack-netlib/SRC/zlaqp3rk.f b/lapack-netlib/SRC/zlaqp3rk.f new file mode 100644 index 000000000..0dd8bf8e3 --- /dev/null +++ b/lapack-netlib/SRC/zlaqp3rk.f @@ -0,0 +1,947 @@ +*> \brief \b ZLAQP3RK computes a step of truncated QR factorization with column pivoting of a complex m-by-n matrix A using Level 3 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZLAQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, +* $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, +* $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) +* IMPLICIT NONE +* LOGICAL DONE +* INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, +* $ NB, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* DOUBLE PRECISION VN1( * ), VN2( * ) +* COMPLEX*16 A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ) +* .. 
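Between the C translation above and the Fortran documentation that continues below, one implementation note: both zlaqp3rk.c and zlaqp3rk.f guard the LAWN 176 partial-norm downdating with the TOL3Z = sqrt(eps) test. A hedged C sketch of that test (not part of the patch; names illustrative): the running norm vn1 is downdated after a column loses its leading entry, and once cancellation relative to the last exactly computed norm vn2 passes the safe bound, the caller must recompute the norm explicitly.

    #include <math.h>

    /* Sketch only: downdate one partial column 2-norm per LAWN 176.
       Returns 1 when the caller must recompute the norm explicitly. */
    static int downdate_norm(double *vn1, double vn2, double abs_aij,
                             double tol3z)
    {
        double temp = abs_aij / *vn1;
        temp = fmax(0.0, (1.0 + temp) * (1.0 - temp));
        double temp2 = temp * (*vn1 / vn2) * (*vn1 / vn2);
        if (temp2 <= tol3z)
            return 1;            /* too much cancellation: recompute */
        *vn1 *= sqrt(temp);      /* safe to keep the downdated norm */
        return 0;
    }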
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZLAQP3RK computes a step of truncated QR factorization with column
+*> pivoting of a complex M-by-N matrix A, block A(IOFFSET+1:M,1:N),
+*> by using Level 3 BLAS as
+*>
+*>   A * P(KB) = Q(KB) * R(KB).
+*>
+*> The routine tries to factorize NB columns from A starting from
+*> the row IOFFSET+1 and updates the residual matrix with BLAS 3
+*> xGEMM. The number of actually factorized columns is returned
+*> in KB and may be smaller than NB.
+*>
+*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides B matrix stored
+*> in A(IOFFSET+1:M,N+1:N+NRHS) with Q(KB)**H * B.
+*>
+*> Cases when the number of factorized columns KB < NB:
+*>
+*> (1) In some cases, due to catastrophic cancellations, it cannot
+*> factorize all NB columns and needs to update the residual matrix.
+*> Hence, the actual number of factorized columns in the block returned
+*> in KB is smaller than NB. The logical DONE is returned as FALSE.
+*> The factorization of the whole original matrix A_orig must proceed
+*> with the next block.
+*>
+*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB.
+*>
+*> (3) In case both stopping criteria ABSTOL or RELTOL are not used,
+*> and when the residual matrix is a zero matrix in some factorization
+*> step KB, the factorization of the whole original matrix A_orig is
+*> stopped, the logical DONE is returned as TRUE. The number of
+*> factorized columns which is smaller than NB is returned in KB.
+*>
+*> (4) Whenever NaN is detected in the matrix A or in the array TAU,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB. The INFO
+*> parameter is set to the column index of the first NaN occurrence.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          Factorization block size, i.e. the number of columns
+*>          to factorize in the matrix A. 0 <= NB.
+*>
+*>          If NB = 0, then the routine exits immediately.
+*>             This means that the factorization is not performed,
+*>             the matrices A and B and the arrays TAU, JPIV
+*>             are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          b) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine ZGEQP3RK. 1 <= KP1 <= N_orig.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is DOUBLE PRECISION
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine ZGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:KB) below
+*>             the diagonal together with the array TAU represent
+*>             the unitary matrix Q(KB) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:KB) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,KB+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(KB)**H.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] DONE
+*> \verbatim
+*>          DONE is LOGICAL
+*>          TRUE: a) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to ABSTOL
+*>                   or RELTOL criterion,
+*>                b) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to the
+*>                   residual matrix being a ZERO matrix.
+*>                c) when NaN was detected in the matrix A
+*>                   or in the array TAU.
+*>          FALSE: otherwise.
+*> \endverbatim
+*>
+*> \param[out] KB
+*> \verbatim
+*>          KB is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= KB <= min(M-IOFFSET,NB,N).
+*>
+*>          KB also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is DOUBLE PRECISION
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank KB. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is DOUBLE PRECISION
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank KB) to the maximum column 2-norm of the
+*>          original matrix A_orig. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX*16 array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] AUXV
+*> \verbatim
+*>          AUXV is COMPLEX*16 array, dimension (NB)
+*>          Auxiliary vector.
+*> \endverbatim
+*>
+*> \param[out] F
+*> \verbatim
+*>          F is COMPLEX*16 array, dimension (LDF,NB)
+*>          Matrix F**H = L*(Y**H)*A.
+*> \endverbatim
+*>
+*> \param[in] LDF
+*> \verbatim
+*>          LDF is INTEGER
+*>          The leading dimension of the array F. LDF >= max(1,N+NRHS).
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          A work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step KB+1 ( when KB columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             KB                  is set to the number of
+*>                                    factorized columns without
+*>                                    exception.
+*>             MAXC2NRMK           is set to NaN.
+*>             RELMAXC2NRMK        is set to NaN.
+*>             TAU(KB+1:min(M,N))  is not set and contains undefined
+*>                                    elements. If j_1=KB+1, TAU(KB+1)
+*>                                    may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step KB+1 ( when KB columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp3rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, + $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + LOGICAL DONE + INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, + $ NB, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + DOUBLE PRECISION VN1( * ), VN2( * ) + COMPLEX*16 A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + COMPLEX*16 CZERO, CONE + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), + $ CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER ITEMP, J, K, MINMNFACT, MINMNUPDT, + $ LSTICC, KP, I, IF + DOUBLE PRECISION HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z + COMPLEX*16 AIK +* .. +* .. External Subroutines .. + EXTERNAL ZGEMM, ZGEMV, ZLARFG, ZSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX + DOUBLE PRECISION DLAMCH, DZNRM2 + EXTERNAL DISNAN, DLAMCH, IDAMAX, DZNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + NB = MIN( NB, MINMNFACT ) + TOL3Z = SQRT( DLAMCH( 'Epsilon' ) ) + HUGEVAL = DLAMCH( 'Overflow' ) +* +* Compute factorization in a while loop over NB columns, +* K is the column index in the block A(1:M,1:N). +* + K = 0 + LSTICC = 0 + DONE = .FALSE. +* + DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 ) + K = K + 1 + I = IOFFSET + K +* + IF( I.EQ.1 ) THEN +* +* We are at the first column of the original whole matrix A_orig, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + KP = KP1 +* + ELSE +* +* Determine the pivot column in K-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). 
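The pivot search resumes just below. First, a hedged skeleton (not part of the patch; names illustrative) of the block loop opened above with DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 ): the routine factorizes at most NB columns but leaves the block early once a difficult column is recorded, so that column's norm can be recomputed before the next block.

    /* Sketch only: shape of the blocked factorization step. */
    static int block_step(int nb)
    {
        int k = 0;          /* columns factorized in this block */
        int lsticc = 0;     /* nonzero marks a "difficult" column */
        while (k < nb && lsticc == 0) {
            ++k;
            /* pivot selection, Householder generation, updates of A
               and F, then norm downdating, which may set lsticc and
               thereby end the block early */
        }
        return k;           /* reported back as KB */
    }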
+* + KP = ( K-1 ) + IDAMAX( N-K+1, VN1( K ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,K:N) in step K. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains NaN, set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( DISNAN( MAXC2NRMK ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = KB + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. + + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* Quick return, if the submatrix A(I:M,K:N) is +* a zero matrix. We need to check it only if the column index +* (same as row index) is larger than 1, since the condition +* for the whole original matrix A_orig is checked in the main +* routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + RELMAXC2NRMK = ZERO +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix is zero and we stop the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. 
set TAU(KB+1:MINMNFACT) = CZERO,
+*              which is equivalent to setting TAU(K:MINMNFACT) = CZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + K - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third tolerance stopping criteria.
+*           NOTE: There is no need to test for ABSTOL.GE.ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*              that are non-zero in each step in the block,
+*              i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*              is the same as the number of processed rows in
+*              the original whole matrix A_orig;
+*
+               KB = K - 1
+               IF = I - 1
+*
+*              Apply the block reflector to the residual of the
+*              matrix A and the residual of the right hand sides B, if
+*              the residual matrix and/or the residual of the right
+*              hand sides exist, i.e. if the submatrix
+*              A(I+1:M,KB+1:N+NRHS) exists. This occurs when
+*              KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*              A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H.
+*
+               IF( KB.LT.MINMNUPDT ) THEN
+                  CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $                  M-IF, N+NRHS-KB, KB,-CONE, A( IF+1, 1 ), LDA,
+     $                  F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO,
+*              which is equivalent to setting TAU(K:MINMNFACT) = CZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           End ELSE of IF(I.EQ.1)
+*
+         END IF
+*
+*        ===============================================================
+*
+*        If the pivot column is not the first column of the
+*        subblock A(1:M,K:N):
+*        1) swap the K-th column and the KP-th pivot column
+*           in A(1:M,1:N);
+*        2) swap the K-th row and the KP-th row in F(1:N,1:K-1)
+*        3) copy the K-th element into the KP-th element of the partial
+*           and exact 2-norm vectors VN1 and VN2. (Swap is not needed
+*           for VN1 and VN2 since we use the element with the index
+*           larger than K in the next loop step.)
+*        4) Save the pivot interchange with the indices relative to
+*           the original matrix A_orig, not the block A(1:M,1:N).
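Steps 1)-4) above are carried out by the lines that follow; as a hedged illustration (not part of the patch; 0-based C with the ZSWAP column/row swaps elided), the scalar bookkeeping amounts to:

    /* Sketch only: record a pivot interchange between columns k and kp. */
    static void record_pivot(int k, int kp, double *vn1, double *vn2,
                             int *jpiv)
    {
        if (kp == k)
            return;             /* pivot already in place */
        /* ...swap columns of A and rows of F here (ZSWAP in the patch)... */
        vn1[kp] = vn1[k];       /* copy, not swap: entry kp is not reused */
        vn2[kp] = vn2[k];
        int itemp = jpiv[kp];   /* swap the recorded interchange */
        jpiv[kp] = jpiv[k];
        jpiv[k] = itemp;
    }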
+* + IF( KP.NE.K ) THEN + CALL ZSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 ) + CALL ZSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF ) + VN1( KP ) = VN1( K ) + VN2( KP ) = VN2( K ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( K ) + JPIV( K ) = ITEMP + END IF +* +* Apply previous Householder reflectors to column K: +* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. +* + IF( K.GT.1 ) THEN + DO J = 1, K - 1 + F( K, J ) = DCONJG( F( K, J ) ) + END DO + CALL ZGEMV( 'No transpose', M-I+1, K-1, -CONE, A( I, 1 ), + $ LDA, F( K, 1 ), LDF, CONE, A( I, K ), 1 ) + DO J = 1, K - 1 + F( K, J ) = DCONJG( F( K, J ) ) + END DO + END IF +* +* Generate elementary reflector H(k) using the column A(I:M,K). +* + IF( I.LT.M ) THEN + CALL ZLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) ) + ELSE + TAU( K ) = CZERO + END IF +* +* Check if TAU(K) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(K) for Inf, +* since ZLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by ZLARFG can contain Inf, which requires +* TAU(K) to contain NaN. Therefore, this case of generating Inf +* by ZLARFG is covered by checking TAU(K) for NaN. +* + IF( DISNAN( DBLE( TAU(K) ) ) ) THEN + TAUNAN = DBLE( TAU(K) ) + ELSE IF( DISNAN( DIMAG( TAU(K) ) ) ) THEN + TAUNAN = DIMAG( TAU(K) ) + ELSE + TAUNAN = ZERO + END IF +* + IF( DISNAN( TAUNAN ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = K +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAUNAN + RELMAXC2NRMK = TAUNAN +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* =============================================================== +* + AIK = A( I, K ) + A( I, K ) = CONE +* +* =============================================================== +* +* Compute the current K-th column of F: +* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). +* + IF( K.LT.N+NRHS ) THEN + CALL ZGEMV( 'Conjugate transpose', M-I+1, N+NRHS-K, + $ TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1, + $ CZERO, F( K+1, K ), 1 ) + END IF +* +* 2) Zero out elements above and on the diagonal of the +* column K in matrix F, i.e elements F(1:K,K). 
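Step 1) above forms the new column of F with a conjugate-transpose ZGEMV; a hedged scalar sketch (not part of the patch; 0-based, column-major, names illustrative) of that product follows, before the patch resumes with the zeroing loop of step 2):

    #include <complex.h>
    #include <stddef.h>

    /* Sketch only: fcol[j-k-1] = tau * sum_i conj(A(i,j)) * A(i,k),
       i.e. F(K+1:N,K) := tau(K) * A(:,K+1:N)**H * A(:,K). */
    static void form_f_column(size_t m, size_t n, size_t k,
                              const double complex *A, size_t lda,
                              double complex tau, double complex *fcol)
    {
        for (size_t j = k + 1; j < n; ++j) {
            double complex s = 0.0;
            for (size_t i = 0; i < m; ++i)
                s += conj(A[i + j * lda]) * A[i + k * lda];
            fcol[j - k - 1] = tau * s;
        }
    }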
+* + DO J = 1, K + F( J, K ) = CZERO + END DO +* +* 3) Incremental updating of the K-th column of F: +* F(1:N+NRHS,K) := F(1:N+NRHS,K) - tau(K) * F(1:N+NRHS,1:K-1) * A(I:M,1:K-1)**H +* * A(I:M,K). +* + IF( K.GT.1 ) THEN + CALL ZGEMV( 'Conjugate Transpose', M-I+1, K-1, -TAU( K ), + $ A( I, 1 ), LDA, A( I, K ), 1, CZERO, + $ AUXV( 1 ), 1 ) +* + CALL ZGEMV( 'No transpose', N+NRHS, K-1, CONE, + $ F( 1, 1 ), LDF, AUXV( 1 ), 1, CONE, + $ F( 1, K ), 1 ) + END IF +* +* =============================================================== +* +* Update the current I-th row of A: +* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) +* - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. +* + IF( K.LT.N+NRHS ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ 1, N+NRHS-K, K, -CONE, A( I, 1 ), LDA, + $ F( K+1, 1 ), LDF, CONE, A( I, K+1 ), LDA ) + END IF +* + A( I, K ) = AIK +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. +* when K < MINMNFACT = min( M-IOFFSET, N ). +* + IF( K.LT.MINMNFACT ) THEN +* + DO J = K + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ABS( A( I, J ) ) / VN1( J ) + TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2.LE.TOL3Z ) THEN +* +* At J-index, we have a difficult column for the +* update of the 2-norm. Save the index of the previous +* difficult column in IWORK(J-1). +* NOTE: LSTICC > 1, therefore we can use IWORK only +* with N-1 elements, where the elements are +* shifted by 1 to the left. +* + IWORK( J-1 ) = LSTICC +* +* Set the index of the last difficult column LSTICC. +* + LSTICC = J +* + ELSE + VN1( J ) = VN1( J )*SQRT( TEMP ) + END IF + END IF + END DO +* + END IF +* +* End of while loop. +* + END DO +* +* Now, after the loop: +* Set KB, the number of factorized columns in the block; +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig, IF = IOFFSET + KB. +* + KB = K + IF = I +* +* Apply the block reflector to the residual of the matrix A +* and the residual of the right hand sides B, if the residual +* matrix and/or the residual of the right hand sides +* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. +* This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, N+NRHS-KB, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* Recompute the 2-norm of the difficult columns. +* Loop over the index of the difficult columns from the largest +* to the smallest index. +* + DO WHILE( LSTICC.GT.0 ) +* +* LSTICC is the index of the last difficult column and is +* greater than 1. +* ITEMP is the index of the previous difficult column. +* + ITEMP = IWORK( LSTICC-1 ) +* +* Compute the 2-norm explicitly for the last difficult column and +* save it in the partial and exact 2-norm vectors VN1 and VN2. +* +* NOTE: The computation of VN1( LSTICC ) relies on the fact that +* DZNRM2 does not fail on vectors with norm below the value of +* SQRT(DLAMCH('S')). +* + VN1( LSTICC ) = DZNRM2( M-IF, A( IF+1, LSTICC ), 1 ) + VN2( LSTICC ) = VN1( LSTICC ) +* +* Downdate the index of the last difficult column to +* the index of the previous difficult column.
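+* NOTE (editorial): IWORK(1:N-1) acts as a linked list of the +* difficult columns, stored shifted one place to the left: when +* column J was flagged above, IWORK(J-1) received the previous +* value of LSTICC and LSTICC was set to J. This loop therefore +* walks the list from the last flagged column back to the first, +* recomputing each 2-norm with DZNRM2.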
+* + LSTICC = ITEMP +* + END DO +* + RETURN +* +* End of ZLAQP3RK +* + END diff --git a/lapack-netlib/SRC/zlaswlq.f b/lapack-netlib/SRC/zlaswlq.f index be4c48539..735207132 100644 --- a/lapack-netlib/SRC/zlaswlq.f +++ b/lapack-netlib/SRC/zlaswlq.f @@ -96,22 +96,23 @@ *> The leading dimension of the array T. LDT >= MB. *> \endverbatim *> -*> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= MB*M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -159,33 +160,37 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup laswlq +*> * ===================================================================== SUBROUTINE ZLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), T( LDT, *) + COMPLEX*16 A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL ZGELQT, ZTPLQT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -196,12 +201,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LE.0 ) THEN INFO = -4 @@ -209,60 +221,61 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = MB*M +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN - CALL ZGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) + IF( (M.GE.N) .OR. (NB.LE.M) .OR. 
(NB.GE.N) ) THEN + CALL ZGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO ) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL ZGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL ZGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M, (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL ZTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), - $ LDA, T(1, CTR * M + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL ZTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + $ LDA, T(1, CTR * M + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL ZTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), - $ LDA, T(1, CTR * M + 1), LDT, - $ WORK, INFO ) - END IF + $ LDA, T(1, CTR * M + 1), LDT, + $ WORK, INFO ) + END IF * - WORK( 1 ) = M * MB + WORK( 1 ) = LWMIN RETURN * * End of ZLASWLQ diff --git a/lapack-netlib/SRC/zlatrs3.f b/lapack-netlib/SRC/zlatrs3.f index 231a17274..27eac839b 100644 --- a/lapack-netlib/SRC/zlatrs3.f +++ b/lapack-netlib/SRC/zlatrs3.f @@ -158,7 +158,11 @@ *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -166,6 +170,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -182,7 +187,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -257,7 +262,7 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. @@ -296,15 +301,24 @@ * row. WORK( I + KK * LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = LWMIN * * Test the input parameters. * @@ -326,7 +340,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. 
LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zlatsqr.f b/lapack-netlib/SRC/zlatsqr.f index 8c938aebc..24d00f28a 100644 --- a/lapack-netlib/SRC/zlatsqr.f +++ b/lapack-netlib/SRC/zlatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -161,33 +164,37 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE ZLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), T(LDT, *) + COMPLEX*16 A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, LWMIN, MINMN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. EXTERNAL SUBROUTINES .. - EXTERNAL ZGEQRT, ZTPQRT, XERBLA + EXTERNAL ZGEQRT, ZTPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -198,6 +205,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -205,64 +219,65 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = NB*N +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL ZGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + IF( (MB.LE.N) .OR. 
(MB.GE.M) ) THEN + CALL ZGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL ZGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) - CTR = 1 + CALL ZGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = MB+1, II-MB+N , (MB-N) + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL ZTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, - $ T(1, CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL ZTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + $ T(1, CTR * N + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL ZTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, - $ T(1,CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + IF( II.LE.M ) THEN + CALL ZTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + $ T(1,CTR * N + 1), LDT, + $ WORK, INFO ) + END IF * - work( 1 ) = N*NB + WORK( 1 ) = LWMIN RETURN * * End of ZLATSQR diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt index b4e2223f7..d4e6f970d 100644 --- a/lapack-netlib/TESTING/CMakeLists.txt +++ b/lapack-netlib/TESTING/CMakeLists.txt @@ -54,6 +54,9 @@ add_lapack_test(sgqr.out gqr.in xeigtsts) add_lapack_test(sgsv.out gsv.in xeigtsts) add_lapack_test(scsd.out csd.in xeigtsts) add_lapack_test(slse.out lse.in xeigtsts) +# +# ======== SINGLE DMD EIG TESTS =========================== +add_lapack_test(sdmd.out sdmd.in xdmdeigtsts) endif() if(BUILD_DOUBLE) @@ -85,6 +88,9 @@ add_lapack_test(dgqr.out gqr.in xeigtstd) add_lapack_test(dgsv.out gsv.in xeigtstd) add_lapack_test(dcsd.out csd.in xeigtstd) add_lapack_test(dlse.out lse.in xeigtstd) +# +# ======== DOUBLE DMD EIG TESTS =========================== +add_lapack_test(ddmd.out ddmd.in xdmdeigtstd) endif() if(BUILD_COMPLEX) @@ -114,6 +120,9 @@ add_lapack_test(cgqr.out gqr.in xeigtstc) add_lapack_test(cgsv.out gsv.in xeigtstc) add_lapack_test(ccsd.out csd.in xeigtstc) add_lapack_test(clse.out lse.in xeigtstc) +# +# ======== COMPLEX DMD EIG TESTS =========================== +add_lapack_test(cdmd.out cdmd.in xdmdeigtstc) endif() if(BUILD_COMPLEX16) @@ -145,6 +154,9 @@ add_lapack_test(zgqr.out gqr.in xeigtstz) add_lapack_test(zgsv.out gsv.in xeigtstz) add_lapack_test(zcsd.out csd.in xeigtstz) add_lapack_test(zlse.out lse.in xeigtstz) +# +# ======== COMPLEX16 DMD EIG TESTS =========================== +add_lapack_test(zdmd.out zdmd.in xdmdeigtstz) endif() diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index d252c7fa9..e7236677a 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -42,6 +42,8 @@ set(SEIGTST schkee.F sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f sstt22.f ssyl01.f ssyt21.f ssyt22.f) +set(SDMDEIGTST schkdmd.f90) + set(CEIGTST cchkee.F cbdt01.f cbdt02.f cbdt03.f cbdt05.f cchkbb.f cchkbd.f cchkbk.f cchkbl.f cchkec.f @@ -59,6 +61,8 @@ set(CEIGTST cchkee.F csgt01.f cslect.f csyl01.f cstt21.f cstt22.f cunt01.f cunt03.f) +set(CDMDEIGTST cchkdmd.f90) + set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f dsvdch.f dsvdct.f dsxt1.f) @@ -79,6 +83,8 @@ 
set(DEIGTST dchkee.F dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f dstt22.f dsyl01.f dsyt21.f dsyt22.f) +set(DDMDEIGTST dchkdmd.f90) + set(ZEIGTST zchkee.F zbdt01.f zbdt02.f zbdt03.f zbdt05.f zchkbb.f zchkbd.f zchkbk.f zchkbl.f zchkec.f @@ -96,6 +102,8 @@ set(ZEIGTST zchkee.F zsgt01.f zslect.f zsyl01.f zstt21.f zstt22.f zunt01.f zunt03.f) +set(ZDMDEIGTST zchkdmd.f90) + macro(add_eig_executable name) add_executable(${name} ${ARGN}) target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) @@ -104,16 +112,20 @@ endmacro() if(BUILD_SINGLE) add_eig_executable(xeigtsts ${SEIGTST} ${SCIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtsts ${SDMDEIGTST}) endif() if(BUILD_COMPLEX) add_eig_executable(xeigtstc ${CEIGTST} ${SCIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtstc ${CDMDEIGTST}) endif() if(BUILD_DOUBLE) add_eig_executable(xeigtstd ${DEIGTST} ${DZIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtstd ${DDMDEIGTST}) endif() if(BUILD_COMPLEX16) add_eig_executable(xeigtstz ${ZEIGTST} ${DZIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtstz ${ZDMDEIGTST}) endif() diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index 942ae6982..9cf0fc95e 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -64,6 +64,8 @@ SEIGTST = schkee.o \ sort03.o ssbt21.o ssgt01.o sslect.o sspt21.o sstt21.o \ sstt22.o ssyl01.o ssyt21.o ssyt22.o +SDMDEIGTST = schkdmd.o + CEIGTST = cchkee.o \ cbdt01.o cbdt02.o cbdt03.o cbdt05.o \ cchkbb.o cchkbd.o cchkbk.o cchkbl.o cchkec.o \ @@ -81,6 +83,8 @@ CEIGTST = cchkee.o \ csgt01.o cslect.o csyl01.o\ cstt21.o cstt22.o cunt01.o cunt03.o +CDMDEIGTST = cchkdmd.o + DZIGTST = dlafts.o dlahd2.o dlasum.o dlatb9.o dstech.o dstect.o \ dsvdch.o dsvdct.o dsxt1.o @@ -101,6 +105,8 @@ DEIGTST = dchkee.o \ dort03.o dsbt21.o dsgt01.o dslect.o dspt21.o dstt21.o \ dstt22.o dsyl01.o dsyt21.o dsyt22.o +DDMDEIGTST = dchkdmd.o + ZEIGTST = zchkee.o \ zbdt01.o zbdt02.o zbdt03.o zbdt05.o \ zchkbb.o zchkbd.o zchkbk.o zchkbl.o zchkec.o \ @@ -118,14 +124,28 @@ ZEIGTST = zchkee.o \ zsgt01.o zslect.o zsyl01.o\ zstt21.o zstt22.o zunt01.o zunt03.o +ZDMDEIGTST = zchkdmd.o + .PHONY: all all: single complex double complex16 .PHONY: single complex double complex16 -single: xeigtsts -complex: xeigtstc -double: xeigtstd -complex16: xeigtstz +single: xeigtsts xdmdeigtsts +complex: xeigtstc xdmdeigtstc +double: xeigtstd xdmdeigtstd +complex16: xeigtstz xdmdeigtstz + +xdmdeigtsts: $(SDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ + +xdmdeigtstc: $(CDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ + +xdmdeigtstd: $(DDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ + +xdmdeigtstz: $(ZDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ @@ -139,6 +159,10 @@ xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +$(SDMDEIGTST): $(FRC) +$(CDMDEIGTST): $(FRC) +$(DDMDEIGTST): $(FRC) +$(ZDMDEIGTST): $(FRC) $(AEIGTST): $(FRC) $(SCIGTST): $(FRC) $(DZIGTST): $(FRC) @@ -155,7 +179,7 @@ clean: cleanobj cleanexe cleanobj: rm -f *.o cleanexe: - rm -f xeigtst* + rm -f xeigtst* xdmdeigtst* schkee.o: schkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< @@ 
-165,3 +189,11 @@ cchkee.o: cchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< zchkee.o: zchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< +schkdmd.o: schkdmd.f90 + $(FC) $(FFLAGS_DRV) -c -o $@ $< +cchkdmd.o: cchkdmd.f90 + $(FC) $(FFLAGS_DRV) -c -o $@ $< +dchkdmd.o: dchkdmd.f90 + $(FC) $(FFLAGS_DRV) -c -o $@ $< +zchkdmd.o: zchkdmd.f90 + $(FC) $(FFLAGS_DRV) -c -o $@ $< diff --git a/lapack-netlib/TESTING/EIG/alareq.f b/lapack-netlib/TESTING/EIG/alareq.f index 2585a686a..2cbe6db38 100644 --- a/lapack-netlib/TESTING/EIG/alareq.f +++ b/lapack-netlib/TESTING/EIG/alareq.f @@ -28,12 +28,12 @@ *> to evaluate the input line which requested NMATS matrix types for *> PATH. The flow of control is as follows: *> -*> If NMATS = NTYPES then +*> IF NMATS = NTYPES THEN *> DOTYPE(1:NTYPES) = .TRUE. -*> else +*> ELSE *> Read the next input line for NMATS matrix types *> Set DOTYPE(I) = .TRUE. for each valid type I -*> endif +*> END IF *> \endverbatim * * Arguments: diff --git a/lapack-netlib/TESTING/EIG/alarqg.f b/lapack-netlib/TESTING/EIG/alarqg.f index 6e2e6e7ec..b9fb88c65 100644 --- a/lapack-netlib/TESTING/EIG/alarqg.f +++ b/lapack-netlib/TESTING/EIG/alarqg.f @@ -28,12 +28,12 @@ *> to evaluate the input line which requested NMATS matrix types for *> PATH. The flow of control is as follows: *> -*> If NMATS = NTYPES then +*> IF NMATS = NTYPES THEN *> DOTYPE(1:NTYPES) = .TRUE. -*> else +*> ELSE *> Read the next input line for NMATS matrix types *> Set DOTYPE(I) = .TRUE. for each valid type I -*> endif +*> END IF *> \endverbatim * * Arguments: diff --git a/lapack-netlib/TESTING/EIG/cchkdmd.f90 b/lapack-netlib/TESTING/EIG/cchkdmd.f90 new file mode 100644 index 000000000..a9c181da9 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/cchkdmd.f90 @@ -0,0 +1,721 @@ +! This is a test program for checking the implementations +! of the following subroutines +! +! CGEDMD, for computation of the +! Dynamic Mode Decomposition (DMD) +! CGEDMDQ, for computation of a +! QR factorization based compressed DMD +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! ======================================================== +! How to run the code (compiler, link info) +! ======================================================== +! Compile as FORTRAN 90 (or later) and link with BLAS and +! LAPACK libraries. +! NOTE: The code is developed and tested on top of the +! Intel MKL library (versions 2022.0.3 and 2022.2.0), +! using the Intel Fortran compiler. +! +! For developers of the C++ implementation +! ======================================================== +! See the LAPACK++ and Template Numerical Toolkit (TNT) +! +! Note on a development of the GPU HP implementation +! ======================================================== +! Work in progress. See CUDA, MAGMA, SLATE. +! NOTE: The four SVD subroutines used in this code are +! included as a part of R&D and for completeness. +! This was also an opportunity to test those SVD codes. +! If the scaling option is used, all four are essentially +! equally good. For implementations on HP platforms, +! one can use whichever SVD is available. +!............................................................ +
!............................................................ +!............................................................ +!
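+! How to run (editorial sketch, not upstream documentation): for +! each of the four LLOOP passes the driver reads the snapshot +! dimension M and the number of snapshots N from standard input +! (see the READ statements below), so it is normally driven by a +! small input file such as the cdmd.in used by the CMake and +! Makefile test targets above, e.g. +! ./xdmdeigtstc < cdmd.in > cdmd.out +!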
+ PROGRAM DMD_TEST + + use iso_fortran_env + IMPLICIT NONE + integer, parameter :: WP = real32 +!............................................................ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP + + COMPLEX(KIND=WP), PARAMETER :: CONE = ( 1.0_WP, 0.0_WP ) + COMPLEX(KIND=WP), PARAMETER :: CZERO = ( 0.0_WP, 0.0_WP ) +!............................................................ + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:) :: RES, & + RES1, RESEX, SINGVX, SINGVQX, WORK + INTEGER , ALLOCATABLE, DIMENSION(:) :: IWORK + REAL(KIND=WP) :: WDUMMY(2) + INTEGER :: IDUMMY(4), ISEED(4) + REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, EPS, & + TOL, TOL2, SVDIFF, TMP, TMP_AU, & + TMP_FQR, TMP_REZ, TMP_REZQ, TMP_XW, & + TMP_EX +!............................................................ + COMPLEX(KIND=WP) :: CMAX + INTEGER :: LCWORK + COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:,:) :: A, AC, & + AU, F, F0, F1, S, W, & + X, X0, Y, Y0, Y1, Z, Z1 + COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:) :: CDA, CDR, & + CDL, CEIGS, CEIGSA, CWORK + COMPLEX(KIND=WP) :: CDUMMY(22), CDUM2X2(2,2) +!............................................................ + INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY, & + LDZ, LIWORK, LWORK, M, N, LLOOP, NRNK + INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, j, & + NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ, & + NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, & + NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD + INTEGER :: iNRNK, iWHTSVD, K_traj, LWMINOPT + CHARACTER :: GRADE, JOBREF, JOBZ, PIVTNG, RSIGN, & + SCALE, RESIDS, WANTQ, WANTR + LOGICAL :: TEST_QRDMD + +!..... external subroutines (BLAS and LAPACK) + EXTERNAL CAXPY, CGEEV, CGEMM, CGEMV, CLASCL +!.....external subroutines DMD package +! subroutines under test + EXTERNAL CGEDMD, CGEDMDQ +!..... external functions (BLAS and LAPACK) + EXTERNAL SCNRM2, SLAMCH + REAL(KIND=WP) :: SCNRM2, SLAMCH + EXTERNAL CLANGE + REAL(KIND=WP) :: CLANGE + EXTERNAL ICAMAX + INTEGER ICAMAX + EXTERNAL LSAME + LOGICAL LSAME + + INTRINSIC ABS, INT, MIN, MAX, SIGN +!............................................................ + + + WRITE(*,*) 'COMPLEX CODE TESTING' + + ! The test is always in pairs : ( CGEDMD and CGEDMDQ) + ! because the test includes comparing the results (in pairs). +!..................................................................................... + ! This code by default performs tests on CGEDMDQ + ! Since the QR factorizations based algorithm is designed for + ! single trajectory data, only single trajectory tests will + ! be performed with xGEDMDQ. + + WANTQ = 'Q' + WANTR = 'R' +!................................................................................. + + EPS = SLAMCH( 'P' ) ! machine precision WP + + ! Global counters of failures of some particular tests + NFAIL = 0 + NFAIL_REZ = 0 + NFAIL_REZQ = 0 + NFAIL_Z_XV = 0 + NFAIL_F_QR = 0 + NFAIL_AU = 0 + NFAIL_SVDIFF = 0 + NFAIL_TOTAL = 0 + NFAILQ_TOTAL = 0 + + DO LLOOP = 1, 4 + + WRITE(*,*) 'L Loop Index = ', LLOOP + + ! Set the dimensions of the problem ... + READ(*,*) M + WRITE(*,*) 'M = ', M + ! ... and the number of snapshots. + READ(*,*) N + WRITE(*,*) 'N = ', N + + ! Test the dimensions + IF ( ( MIN(M,N) == 0 ) .OR. ( M < N ) ) THEN + WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.' + STOP + END IF +!............. + ! The seed inside the LLOOP so that each pass can be reproduced easily. 
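+! (Editorial note: the fixed seed (4,3,2,1) set below feeds both +! CLATMR and CLARNV from the same stream, so every pass of the +! loop regenerates identical test data.)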
+ ISEED(1) = 4 + ISEED(2) = 3 + ISEED(3) = 2 + ISEED(4) = 1 + + LDA = M + LDF = M + LDX = M + LDY = M + LDW = N + LDZ = M + LDAU = M + LDS = N + + TMP_XW = ZERO + TMP_AU = ZERO + TMP_REZ = ZERO + TMP_REZQ = ZERO + SVDIFF = ZERO + TMP_EX = ZERO + + ALLOCATE( A(LDA,M) ) + ALLOCATE( AC(LDA,M) ) + ALLOCATE( F(LDF,N+1) ) + ALLOCATE( F0(LDF,N+1) ) + ALLOCATE( F1(LDF,N+1) ) + ALLOCATE( X(LDX,N) ) + ALLOCATE( X0(LDX,N) ) + ALLOCATE( Y(LDY,N+1) ) + ALLOCATE( Y0(LDY,N+1) ) + ALLOCATE( Y1(LDY,N+1) ) + ALLOCATE( AU(LDAU,N) ) + ALLOCATE( W(LDW,N) ) + ALLOCATE( S(LDS,N) ) + ALLOCATE( Z(LDZ,N) ) + ALLOCATE( Z1(LDZ,N) ) + ALLOCATE( RES(N) ) + ALLOCATE( RES1(N) ) + ALLOCATE( RESEX(N) ) + ALLOCATE( CEIGS(N) ) + ALLOCATE( SINGVX(N) ) + ALLOCATE( SINGVQX(N) ) + + TOL = 10*M*EPS + TOL2 = 10*M*N*EPS + +!............. + + DO K_traj = 1, 2 + ! Number of initial conditions in the simulation/trajectories (1 or 2) + + COND = 1.0D4 + CMAX = (1.0D1,1.0D1) + RSIGN = 'F' + GRADE = 'N' + MODEL = 6 + CONDL = 1.0D1 + MODER = 6 + CONDR = 1.0D1 + PIVTNG = 'N' + ! Loop over all parameter MODE values for CLATMR (+1,..,+6) + + DO MODE = 1, 6 + + ALLOCATE( IWORK(2*M) ) + ALLOCATE( CDA(M) ) + ALLOCATE( CDL(M) ) + ALLOCATE( CDR(M) ) + + CALL CLATMR( M, M, 'N', ISEED, 'N', CDA, MODE, COND, & + CMAX, RSIGN, GRADE, CDL, MODEL, CONDL, & + CDR, MODER, CONDR, PIVTNG, IWORK, M, M, & + ZERO, -ONE, 'N', A, LDA, IWORK(M+1), INFO ) + DEALLOCATE( CDR ) + DEALLOCATE( CDL ) + DEALLOCATE( CDA ) + DEALLOCATE( IWORK ) + + LCWORK = MAX(1,2*M) + ALLOCATE( CEIGSA(M) ) + ALLOCATE( CWORK(LCWORK) ) + ALLOCATE( WORK(2*M) ) + AC(1:M,1:M) = A(1:M,1:M) + CALL CGEEV( 'N','N', M, AC, LDA, CEIGSA, CDUM2X2, 2, & + CDUM2X2, 2, CWORK, LCWORK, WORK, INFO ) ! LAPACK CALL + DEALLOCATE(WORK) + DEALLOCATE(CWORK) + + TMP = ABS(CEIGSA(ICAMAX(M, CEIGSA, 1))) ! The spectral radius of A + ! Scale the matrix A to have unit spectral radius. + CALL CLASCL( 'G',0, 0, TMP, ONE, M, M, & + A, LDA, INFO ) + CALL CLASCL( 'G',0, 0, TMP, ONE, M, 1, & + CEIGSA, M, INFO ) + ANORM = CLANGE( 'F', M, M, A, LDA, WDUMMY ) + + IF ( K_traj == 2 ) THEN + ! generate data as two trajectories + ! with two initial conditions + CALL CLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N/2 + CALL CGEMV( 'N', M, M, CONE, A, LDA, F(1,i), 1, & + CZERO, F(1,i+1), 1 ) + END DO + X0(1:M,1:N/2) = F(1:M,1:N/2) + Y0(1:M,1:N/2) = F(1:M,2:N/2+1) + + CALL CLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N-N/2 + CALL CGEMV( 'N', M, M, CONE, A, LDA, F(1,i), 1, & + CZERO, F(1,i+1), 1 ) + END DO + X0(1:M,N/2+1:N) = F(1:M,1:N-N/2) + Y0(1:M,N/2+1:N) = F(1:M,2:N-N/2+1) + ELSE + CALL CLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N + CALL CGEMV( 'N', M, M, CONE, A, M, F(1,i), 1, & + CZERO, F(1,i+1), 1 ) + END DO + F0(1:M,1:N+1) = F(1:M,1:N+1) + X0(1:M,1:N) = F0(1:M,1:N) + Y0(1:M,1:N) = F0(1:M,2:N+1) + END IF + + DEALLOCATE( CEIGSA ) +!........................................................................ + + DO iJOBZ = 1, 4 + + SELECT CASE ( iJOBZ ) + CASE(1) + JOBZ = 'V' + RESIDS = 'R' + CASE(2) + JOBZ = 'V' + RESIDS = 'N' + CASE(3) + JOBZ = 'F' + RESIDS = 'N' + CASE(4) + JOBZ = 'N' + RESIDS = 'N' + END SELECT + + DO iJOBREF = 1, 3 + + SELECT CASE ( iJOBREF ) + CASE(1) + JOBREF = 'R' + CASE(2) + JOBREF = 'E' + CASE(3) + JOBREF = 'N' + END SELECT + + DO iSCALE = 1, 4 + + SELECT CASE ( iSCALE ) + CASE(1) + SCALE = 'S' + CASE(2) + SCALE = 'C' + CASE(3) + SCALE = 'Y' + CASE(4) + SCALE = 'N' + END SELECT + + DO iNRNK = -1, -2, -1 + NRNK = iNRNK + + DO iWHTSVD = 1, 3 + ! Check the three options (WHTSVD = 1, 2, 3) to compute + ! the POD basis via the SVD.
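+ ! (Editorial note, hedged: in the xGEDMD drivers WHTSVD + ! selects the LAPACK SVD routine used for the POD basis, + ! presumably 1 = CGESVD, 2 = CGESDD, 3 = CGESVDQ; option 4, + ! xGEJSV, exists only for real types, which is why this + ! complex driver stops at 3.)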
+ WHTSVD = iWHTSVD + + DO LWMINOPT = 1, 2 + ! Workspace query for the minimal (1) and for the optimal + ! (2) workspace lengths determined by workspace query. + + ! CGEDMD is always tested and its results are also used for + ! comparisons with CGEDMDQ. + + X(1:M,1:N) = X0(1:M,1:N) + Y(1:M,1:N) = Y0(1:M,1:N) + + CALL CGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, CEIGS, Z, LDZ, RES, & + AU, LDAU, W, LDW, S, LDS, & + CDUMMY, -1, WDUMMY, -1, IDUMMY, -1, INFO ) + + IF ( (INFO .EQ. 2) .OR. ( INFO .EQ. 3 ) & + .OR. ( INFO < 0 ) ) THEN + WRITE(*,*) 'Call to CGEDMD workspace query failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ', & + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL, LDZ, LDAU, LDW, LDS + STOP + ELSE + !WRITE(*,*) '... done. Workspace length computed.' + END IF + + LCWORK = INT(CDUMMY(LWMINOPT)) + ALLOCATE(CWORK(LCWORK)) + LIWORK = IDUMMY(1) + ALLOCATE(IWORK(LIWORK)) + LWORK = INT(WDUMMY(1)) + ALLOCATE(WORK(LWORK)) + + CALL CGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, CEIGS, Z, LDZ, RES, & + AU, LDAU, W, LDW, S, LDS, & + CWORK, LCWORK, WORK, LWORK, IWORK, LIWORK, INFO ) + IF ( INFO /= 0 ) THEN + WRITE(*,*) 'Call to CGEDMD failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + STOP + END IF + SINGVX(1:N) = WORK(1:N) + + !...... CGEDMD check point + IF ( LSAME(JOBZ,'V') ) THEN + ! Check that Z = X*W, on return from CGEDMD + ! This checks that the returned eigenvectors in Z are + ! the product of the SVD'POD basis returned in X + ! and the eigenvectors of the Rayleigh quotient + ! returned in W + CALL CGEMM( 'N', 'N', M, K, K, CONE, X, LDX, W, LDW, & + CZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL CAXPY( M, -CONE, Z(1,i), 1, Z1(1,i), 1) + TMP = MAX(TMP, SCNRM2( M, Z1(1,i), 1 ) ) + END DO + TMP_XW = MAX(TMP_XW, TMP ) + IF ( TMP_XW <= TOL ) THEN + !WRITE(*,*) ' :) .... OK .........CGEDMD PASSED.' + ELSE + NFAIL_Z_XV = NFAIL_Z_XV + 1 + WRITE(*,*) ':( .................CGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + END IF + !...... CGEDMD check point + + IF ( LSAME(JOBREF,'R') ) THEN + ! The matrix A*U is returned for computing refined Ritz vectors. + ! Check that A*U is computed correctly using the formula + ! A*U = Y * V * inv(SIGMA). This depends on the + ! accuracy in the computed singular values and vectors of X. + ! See the paper for an error analysis. + ! Note that the left singular vectors of the input matrix X + ! are returned in the array X. + CALL CGEMM( 'N', 'N', M, K, M, CONE, A, LDA, X, LDX, & + CZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL CAXPY( M, -CONE, AU(1,i), 1, Z1(1,i), 1) + TMP = MAX( TMP, SCNRM2( M, Z1(1,i),1 ) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_AU = MAX( TMP_AU, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) ':) .... OK .........CGEDMD PASSED.' + ELSE + NFAIL_AU = NFAIL_AU + 1 + WRITE(*,*) ':( .................CGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL2 + END IF + ELSEIF ( LSAME(JOBREF,'E') ) THEN + ! 
The unscaled vectors of the Exact DMD are computed. + ! This option is included for the sake of completeness, + ! for users who prefer the Exact DMD vectors. The + ! returned vectors are stored in the same way + ! as the Ritz vectors. Here we just save the vectors + ! and test them separately using a Matlab script. + CALL CGEMM( 'N', 'N', M, K, M, CONE, A, LDA, AU, LDAU, CZERO, Y1, LDY ) + + DO i=1, K + CALL CAXPY( M, -CEIGS(i), AU(1,i), 1, Y1(1,i), 1 ) + RESEX(i) = SCNRM2( M, Y1(1,i), 1) / SCNRM2(M,AU(1,i),1) + END DO + END IF + !...... CGEDMD check point + + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by CGEDMD with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL CGEMM( 'N', 'N', M, K, M, CONE, A, LDA, Z, LDZ, CZERO, Y1, LDY ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i). (See the + ! description of Z in CGEDMD.) + + DO i=1, K + CALL CAXPY( M, -CEIGS(i), Z(1,i), 1, Y1(1,i), 1 ) + RES1(i) = SCNRM2( M, Y1(1,i), 1) + END DO + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_REZ = MAX( TMP_REZ, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) ':) .... OK ..........CGEDMD PASSED.' + ELSE + NFAIL_REZ = NFAIL_REZ + 1 + WRITE(*,*) ':( ..................CGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + + IF ( LSAME(JOBREF,'E') ) THEN + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) ) + END DO + TMP_EX = MAX(TMP_EX,TMP) + END IF + + END IF + + DEALLOCATE(CWORK) + DEALLOCATE(WORK) + DEALLOCATE(IWORK) + +!....................................................................................................... + + IF ( K_traj == 1 ) THEN + + F(1:M,1:N+1) = F0(1:M,1:N+1) + CALL CGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, & + WHTSVD, M, N+1, F, LDF, X, LDX, Y, LDY, & + NRNK, TOL, K, CEIGS, Z, LDZ, RES, AU, & + LDAU, W, LDW, S, LDS, CDUMMY, -1, & + WDUMMY, -1, IDUMMY, -1, INFO ) + + LCWORK = INT(CDUMMY(LWMINOPT)) + ALLOCATE(CWORK(LCWORK)) + LIWORK = IDUMMY(1) + ALLOCATE(IWORK(LIWORK)) + LWORK = INT(WDUMMY(1)) + ALLOCATE(WORK(LWORK)) + + CALL CGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, & + WHTSVD, M, N+1, F, LDF, X, LDX, Y, LDY, & + NRNK, TOL, KQ, CEIGS, Z, LDZ, RES, AU, & + LDAU, W, LDW, S, LDS, CWORK, LCWORK, & + WORK, LWORK, IWORK, LIWORK, INFO ) + IF ( INFO /= 0 ) THEN + WRITE(*,*) 'Call to CGEDMDQ failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, WANTQ, WANTR, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + STOP + END IF + SINGVQX(1:N) = WORK(1:N) + + !..... CGEDMDQ check point + + TMP = ZERO + DO i = 1, MIN(K, KQ) + TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / & + SINGVX(1) ) + END DO + SVDIFF = MAX( SVDIFF, TMP ) + IF ( TMP > TOL2 ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_SVDIFF = NFAIL_SVDIFF + 1 + END IF + !..... CGEDMDQ check point + + !..... CGEDMDQ check point + IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN + ! Check that the QR factors are computed and returned + ! as requested. The residual ||F-Q*R||_F / ||F||_F + ! is compared to M*N*EPS.
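+ ! (Editorial note: on return from CGEDMDQ with WANTQ = 'Q' + ! and WANTR = 'R', the orthonormal factor is expected in F + ! and the triangular factor in Y; this reading follows the + ! calling sequence of the GEMM below rather than upstream + ! documentation.)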
+ F1(1:M,1:N+1) = F0(1:M,1:N+1) + CALL CGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -CONE, F, & + LDF, Y, LDY, CONE, F1, LDF ) + TMP_FQR = CLANGE( 'F', M, N+1, F1, LDF, WORK ) / & + CLANGE( 'F', M, N+1, F0, LDF, WORK ) + IF ( TMP_FQR <= TOL2 ) THEN + !WRITE(*,*) ':) CGEDMDQ ........ PASSED.' + ELSE + WRITE(*,*) ':( CGEDMDQ ........ FAILED.' + NFAIL_F_QR = NFAIL_F_QR + 1 + END IF + END IF + !..... CGEDMDQ check point + !..... CGEDMDQ check point + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by CGEDMDQ with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL CGEMM( 'N', 'N', M, KQ, M, CONE, A, LDA, Z, LDZ, CZERO, Y1, LDY ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i). (See the + ! description of Z in CGEDMDQ.) + DO i = 1, KQ + CALL CAXPY( M, -CEIGS(i), Z(1,i), 1, Y1(1,i), 1 ) + RES1(i) = SCNRM2( M, Y1(1,i), 1) + END DO + TMP = ZERO + DO i = 1, KQ + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVQX(KQ)/(ANORM*SINGVQX(1)) ) + END DO + TMP_REZQ = MAX( TMP_REZQ, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) '.... OK ........ CGEDMDQ PASSED.' + ELSE + NFAIL_REZQ = NFAIL_REZQ + 1 + WRITE(*,*) '................ CGEDMDQ FAILED!', & + 'Check the code for implementation errors.' + END IF + END IF + + DEALLOCATE(CWORK) + DEALLOCATE(WORK) + DEALLOCATE(IWORK) + + END IF + + END DO ! LWMINOPT + !write(*,*) 'LWMINOPT loop completed' + END DO ! iWHTSVD + !write(*,*) 'WHTSVD loop completed' + END DO ! iNRNK -2:-1 + !write(*,*) 'NRNK loop completed' + END DO ! iSCALE 1:4 + !write(*,*) 'SCALE loop completed' + END DO + !write(*,*) 'JOBREF loop completed' + END DO ! iJOBZ + !write(*,*) 'JOBZ loop completed' + + END DO ! MODE 1:6 + !write(*,*) 'MODE loop completed' + END DO ! 1 or 2 trajectories + !write(*,*) 'trajectories loop completed' + + DEALLOCATE( A ) + DEALLOCATE( AC ) + DEALLOCATE( Z ) + DEALLOCATE( F ) + DEALLOCATE( F0 ) + DEALLOCATE( F1 ) + DEALLOCATE( X ) + DEALLOCATE( X0 ) + DEALLOCATE( Y ) + DEALLOCATE( Y0 ) + DEALLOCATE( Y1 ) + DEALLOCATE( AU ) + DEALLOCATE( W ) + DEALLOCATE( S ) + DEALLOCATE( Z1 ) + DEALLOCATE( RES ) + DEALLOCATE( RES1 ) + DEALLOCATE( RESEX ) + DEALLOCATE( CEIGS ) + DEALLOCATE( SINGVX ) + DEALLOCATE( SINGVQX ) + + END DO ! LLOOP + + WRITE(*,*) + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for CGEDMD :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + IF ( NFAIL_Z_XV == 0 ) THEN + WRITE(*,*) '>>>> Z - U*V test PASSED.' + ELSE + WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)' + WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_XW + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV + END IF + + IF ( NFAIL_AU == 0 ) THEN + WRITE(*,*) '>>>> A*U test PASSED. ' + ELSE + WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)' + WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU + END IF + + + IF ( NFAIL_REZ == 0 ) THEN + WRITE(*,*) '>>>> Residual computation test PASSED.'
+ ELSE + WRITE(*,*) 'Residual computation test FAILED ', NFAIL_REZ, ' time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ + END IF + IF ( NFAIL_TOTAL == 0 ) THEN + WRITE(*,*) '>>>> CGEDMD :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAIL_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>>>>>>>>> CGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + WRITE(*,*) + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for CGEDMDQ :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + + IF ( NFAIL_SVDIFF == 0 ) THEN + WRITE(*,*) '>>>> CGEDMD and CGEDMDQ computed singular & + &values test PASSED.' + ELSE + WRITE(*,*) 'CGEDMD and CGEDMDQ discrepancies in & + &the singular values were unacceptable ', & + NFAIL_SVDIFF, ' times. Test FAILED.' + WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF + END IF + IF ( NFAIL_F_QR == 0 ) THEN + WRITE(*,*) '>>>> F - Q*R test PASSED.' + ELSE + WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)' + WRITE(*,*) 'The largest relative residual was ', TMP_FQR + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR + END IF + + IF ( NFAIL_REZQ == 0 ) THEN + WRITE(*,*) '>>>> Residual computation test PASSED.' + ELSE + WRITE(*,*) 'Residual computation test FAILED ', NFAIL_REZQ, ' time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ + END IF + + IF ( NFAILQ_TOTAL == 0 ) THEN + WRITE(*,*) '>>>>>>> CGEDMDQ :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>> CGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + WRITE(*,*) + WRITE(*,*) 'Test completed.'
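+ ! (Editorial note: failures are reported only through the NFAIL + ! counters and WRITE messages above; the program ends with a + ! plain STOP, so callers should inspect the captured output + ! rather than the exit status.)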
+ STOP + END diff --git a/lapack-netlib/TESTING/EIG/cerrst.f b/lapack-netlib/TESTING/EIG/cerrst.f index 1748a2aad..d23eb14ea 100644 --- a/lapack-netlib/TESTING/EIG/cerrst.f +++ b/lapack-netlib/TESTING/EIG/cerrst.f @@ -748,17 +748,17 @@ CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 2*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) NT = NT + 12 @@ -830,19 +830,19 @@ INFOT = 18 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0, 0.0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0, 0.0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 26*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0, 0.0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) NT = NT + 13 diff --git a/lapack-netlib/TESTING/EIG/chkxer.f b/lapack-netlib/TESTING/EIG/chkxer.f index fd00bb65a..70caf7e0a 100644 --- a/lapack-netlib/TESTING/EIG/chkxer.f +++ b/lapack-netlib/TESTING/EIG/chkxer.f @@ -61,7 +61,7 @@ RETURN * 9999 FORMAT( ' *** Illegal value of parameter number ', I2, - $ ' not detected by ', A6, ' ***' ) + $ ' not detected by ', A, ' ***' ) * * End of CHKXER * diff --git a/lapack-netlib/TESTING/EIG/dchkdmd.f90 b/lapack-netlib/TESTING/EIG/dchkdmd.f90 new file mode 100644 index 000000000..4fbf7531b --- /dev/null +++ b/lapack-netlib/TESTING/EIG/dchkdmd.f90 @@ -0,0 +1,813 @@ +! This is a test program for checking the implementations of +! the implementations of the following subroutines +! +! DGEDMD for computation of the +! Dynamic Mode Decomposition (DMD) +! DGEDMDQ for computation of a +! QR factorization based compressed DMD +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! ======================================================== +! How to run the code (compiler, link info) +! ======================================================== +! Compile as FORTRAN 90 (or later) and link with BLAS and +! LAPACK libraries. +! NOTE: The code is developed and tested on top of the +! Intel MKL library (versions 2022.0.3 and 2022.2.0), +! using the Intel Fortran compiler. +! +! For developers of the C++ implementation +! ======================================================== +! See the LAPACK++ and Template Numerical Toolkit (TNT) +! +! Note on a development of the GPU HP implementation +! ======================================================== +! Work in progress. 
See CUDA, MAGMA, SLATE. +! NOTE: The four SVD subroutines used in this code are +! included as a part of R&D and for completeness. +! This was also an opportunity to test those SVD codes. +! If the scaling option is used, all four are essentially +! equally good. For implementations on HP platforms, +! one can use whichever SVD is available. +!............................................................ +! NOTE: +! When using the Intel MKL 2022.0.3 the subroutine xGESVDQ +! (optionally used in xGEDMD) may cause access violation +! error for x = S, D, C, Z, but only if called with the +! work space query. (At least in our Windows 10 MSVS 2019.) +! The problem can be mitigated by downloading the source +! code of xGESVDQ from the LAPACK repository and using it +! locally instead of the one in the MKL. This seems to +! indicate that the problem is indeed in the MKL. +! This problem did not appear with Intel MKL 2022.2.0. +! +! NOTE: +! xGESDD seems to have a problem with workspace. In some +! cases the length of the optimal workspace is returned +! smaller than the minimal workspace, as specified in the +! code. As a precaution, all optimal workspaces are +! set as MAX(minimal, optimal). +! Latest implementations of complex xGESDD have different +! length of the real workspace. We use the max value over +! the two versions. +!............................................................ +!............................................................ +! + PROGRAM DMD_TEST + use iso_fortran_env, only: real64 + IMPLICIT NONE + integer, parameter :: WP = real64 + +!............................................................ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP +!............................................................ + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:,:) :: & + A, AC, EIGA, LAMBDA, LAMBDAQ, F, F1, F2,& + Z, Z1, S, AU, W, VA, X, X0, Y, Y0, Y1 + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:) :: & + DA, DL, DR, REIG, REIGA, REIGQ, IEIG, & + IEIGA, IEIGQ, RES, RES1, RESEX, SINGVX,& + SINGVQX, WORK + INTEGER , ALLOCATABLE, DIMENSION(:) :: IWORK + REAL(KIND=WP) :: AB(2,2), WDUMMY(2) + INTEGER :: IDUMMY(2), ISEED(4), RJOBDATA(8) + REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, DMAX, EPS, & + TOL, TOL2, SVDIFF, TMP, TMP_AU, & + TMP_FQR, TMP_REZ, TMP_REZQ, TMP_ZXW, & + TMP_EX, XNORM, YNORM +!............................................................ + INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY, & + LDZ, LIWORK, LWORK, M, N, L, LLOOP, NRNK + INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, j, KDIFF, & + NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ, & + NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, & + NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD + INTEGER iNRNK, iWHTSVD, K_TRAJ, LWMINOPT + CHARACTER(LEN=1) GRADE, JOBREF, JOBZ, PIVTNG, RSIGN, & + SCALE, RESIDS, WANTQ, WANTR + + LOGICAL TEST_QRDMD +!..... external subroutines (BLAS and LAPACK) + EXTERNAL DAXPY, DGEEV, DGEMM, DGEMV, DLACPY, DLASCL + EXTERNAL DLARNV, DLATMR +!.....external subroutines DMD package, part 1 +! subroutines under test + EXTERNAL DGEDMD, DGEDMDQ + +!..... external functions (BLAS and LAPACK) + EXTERNAL DLAMCH, DLANGE, DNRM2 + REAL(KIND=WP) :: DLAMCH, DLANGE, DNRM2 + EXTERNAL LSAME + LOGICAL LSAME + + INTRINSIC ABS, INT, MIN, MAX +!............................................................ + + ! The test is always in pairs : ( DGEDMD and DGEDMDQ ) + ! because the test includes comparing the results (in pairs).
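+ ! (Editorial note: like the complex driver, this program reads + ! an (M, N) pair from standard input for each of the four LLOOP + ! passes; the ddmd.in file referenced by the build systems is + ! assumed to supply them, e.g. ./xdmdeigtstd < ddmd.in > ddmd.out.)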
+!..................................................................................... + TEST_QRDMD = .TRUE. ! This code by default performs tests on DGEDMDQ + ! Since the QR factorizations based algorithm is designed for + ! single trajectory data, only single trajectory tests will + ! be performed with xGEDMDQ. + WANTQ = 'Q' + WANTR = 'R' +!................................................................................. + + EPS = DLAMCH( 'P' ) ! machine precision DP + + ! Global counters of failures of some particular tests + NFAIL = 0 + NFAIL_REZ = 0 + NFAIL_REZQ = 0 + NFAIL_Z_XV = 0 + NFAIL_F_QR = 0 + NFAIL_AU = 0 + KDIFF = 0 + NFAIL_SVDIFF = 0 + NFAIL_TOTAL = 0 + NFAILQ_TOTAL = 0 + + + DO LLOOP = 1, 4 + + WRITE(*,*) 'L Loop Index = ', LLOOP + + ! Set the dimensions of the problem ... + WRITE(*,*) 'M = ' + READ(*,*) M + WRITE(*,*) M + ! ... and the number of snapshots. + WRITE(*,*) 'N = ' + READ(*,*) N + WRITE(*,*) N + + ! ... Test the dimensions + IF ( ( MIN(M,N) == 0 ) .OR. ( M < N ) ) THEN + WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.' + STOP + END IF +!............. + ! The seed inside the LLOOP so that each pass can be reproduced easily. + + ISEED(1) = 4 + ISEED(2) = 3 + ISEED(3) = 2 + ISEED(4) = 1 + + LDA = M + LDF = M + LDX = MAX(M,N+1) + LDY = MAX(M,N+1) + LDW = N + LDZ = M + LDAU = MAX(M,N+1) + LDS = N + + TMP_ZXW = ZERO + TMP_AU = ZERO + TMP_REZ = ZERO + TMP_REZQ = ZERO + SVDIFF = ZERO + TMP_EX = ZERO + + ! + ! Test the subroutines on real data snapshots. All + ! computation is done in real arithmetic, even when + ! Koopman eigenvalues and modes are real. + ! + ! Allocate memory space + ALLOCATE( A(LDA,M) ) + ALLOCATE( AC(LDA,M) ) + ALLOCATE( DA(M) ) + ALLOCATE( DL(M) ) + ALLOCATE( F(LDF,N+1) ) + ALLOCATE( F1(LDF,N+1) ) + ALLOCATE( F2(LDF,N+1) ) + ALLOCATE( X(LDX,N) ) + ALLOCATE( X0(LDX,N) ) + ALLOCATE( SINGVX(N) ) + ALLOCATE( SINGVQX(N) ) + ALLOCATE( Y(LDY,N+1) ) + ALLOCATE( Y0(LDY,N+1) ) + ALLOCATE( Y1(M,N+1) ) + ALLOCATE( Z(LDZ,N) ) + ALLOCATE( Z1(LDZ,N) ) + ALLOCATE( RES(N) ) + ALLOCATE( RES1(N) ) + ALLOCATE( RESEX(N) ) + ALLOCATE( REIG(N) ) + ALLOCATE( IEIG(N) ) + ALLOCATE( REIGQ(N) ) + ALLOCATE( IEIGQ(N) ) + ALLOCATE( REIGA(M) ) + ALLOCATE( IEIGA(M) ) + ALLOCATE( VA(LDA,M) ) + ALLOCATE( LAMBDA(N,2) ) + ALLOCATE( LAMBDAQ(N,2) ) + ALLOCATE( EIGA(M,2) ) + ALLOCATE( W(LDW,N) ) + ALLOCATE( AU(LDAU,N) ) + ALLOCATE( S(N,N) ) + + TOL = M*EPS + ! This mimics O(M*N)*EPS bound for accumulated roundoff error. + ! The factor 10 is somewhat arbitrary. + TOL2 = 10*M*N*EPS + +!............. + + DO K_TRAJ = 1, 2 + ! Number of intial conditions in the simulation/trajectories (1 or 2) + + COND = 1.0D8 + DMAX = 1.0D2 + RSIGN = 'F' + GRADE = 'N' + MODEL = 6 + CONDL = 1.0D2 + MODER = 6 + CONDR = 1.0D2 + PIVTNG = 'N' + + ! Loop over all parameter MODE values for ZLATMR (+1,..,+6) + DO MODE = 1, 6 + + ALLOCATE( IWORK(2*M) ) + ALLOCATE(DR(N)) + CALL DLATMR( M, M, 'S', ISEED, 'N', DA, MODE, COND, & + DMAX, RSIGN, GRADE, DL, MODEL, CONDL, & + DR, MODER, CONDR, PIVTNG, IWORK, M, M, & + ZERO, -ONE, 'N', A, LDA, IWORK(M+1), INFO ) + DEALLOCATE(IWORK) + DEALLOCATE(DR) + + LWORK = 4*M+1 + ALLOCATE(WORK(LWORK)) + AC = A + CALL DGEEV( 'N','V', M, AC, M, REIGA, IEIGA, VA, M, & + VA, M, WORK, LWORK, INFO ) ! LAPACK CALL + DEALLOCATE(WORK) + TMP = ZERO + DO i = 1, M + EIGA(i,1) = REIGA(i) + EIGA(i,2) = IEIGA(i) + TMP = MAX( TMP, SQRT(REIGA(i)**2+IEIGA(i)**2)) + END DO + + ! Scale A to have the desirable spectral radius. 
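+ ! (Editorial note: TMP above is the spectral radius of the + ! random A, so the two DLASCL calls below divide A and its + ! stored eigenvalues EIGA by TMP; with spectral radius 1 the + ! generated snapshot sequences do not blow up exponentially.)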
+ CALL DLASCL( 'G', 0, 0, TMP, ONE, M, M, A, M, INFO ) + CALL DLASCL( 'G', 0, 0, TMP, ONE, M, 2, EIGA, M, INFO ) + + ! Compute the norm of A + ANORM = DLANGE( 'F', N, N, A, M, WDUMMY ) + + IF ( K_TRAJ == 2 ) THEN + ! generate data with two initial conditions + CALL DLARNV(2, ISEED, M, F1(1,1) ) + F1(1:M,1) = 1.0E-10*F1(1:M,1) + DO i = 1, N/2 + CALL DGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, & + F1(1,i+1), 1 ) + END DO + X0(1:M,1:N/2) = F1(1:M,1:N/2) + Y0(1:M,1:N/2) = F1(1:M,2:N/2+1) + + CALL DLARNV(2, ISEED, M, F1(1,1) ) + DO i = 1, N-N/2 + CALL DGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, & + F1(1,i+1), 1 ) + END DO + X0(1:M,N/2+1:N) = F1(1:M,1:N-N/2) + Y0(1:M,N/2+1:N) = F1(1:M,2:N-N/2+1) + ELSE + CALL DLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N + CALL DGEMV( 'N', M, M, ONE, A, M, F(1,i), 1, ZERO, & + F(1,i+1), 1 ) + END DO + X0(1:M,1:N) = F(1:M,1:N) + Y0(1:M,1:N) = F(1:M,2:N+1) + END IF + + XNORM = DLANGE( 'F', M, N, X0, LDX, WDUMMY ) + YNORM = DLANGE( 'F', M, N, Y0, LDX, WDUMMY ) +!............................................................ + + DO iJOBZ = 1, 4 + + SELECT CASE ( iJOBZ ) + CASE(1) + JOBZ = 'V' ! Ritz vectors will be computed + RESIDS = 'R' ! Residuals will be computed + CASE(2) + JOBZ = 'V' + RESIDS = 'N' + CASE(3) + JOBZ = 'F' ! Ritz vectors in factored form + RESIDS = 'N' + CASE(4) + JOBZ = 'N' + RESIDS = 'N' + END SELECT + + DO iJOBREF = 1, 3 + + SELECT CASE ( iJOBREF ) + CASE(1) + JOBREF = 'R' ! Data for refined Ritz vectors + CASE(2) + JOBREF = 'E' ! Exact DMD vectors + CASE(3) + JOBREF = 'N' + END SELECT + + DO iSCALE = 1, 4 + + SELECT CASE ( iSCALE ) + CASE(1) + SCALE = 'S' ! X data normalized + CASE(2) + SCALE = 'C' ! X normalized, consist. check + CASE(3) + SCALE = 'Y' ! Y data normalized + CASE(4) + SCALE = 'N' + END SELECT + + DO iNRNK = -1, -2, -1 + ! Two truncation strategies. The "-2" case for R&D + ! purposes only - it uses possibly low accuracy small + ! singular values, in which case the formulas used in + ! the DMD are highly sensitive. + NRNK = iNRNK + + DO iWHTSVD = 1, 4 + ! Check all four options to compute the POD basis + ! via the SVD. + WHTSVD = iWHTSVD + + DO LWMINOPT = 1, 2 + ! Workspace query for the minimal (1) and for the optimal + ! (2) workspace lengths determined by workspace query. + + X(1:M,1:N) = X0(1:M,1:N) + Y(1:M,1:N) = Y0(1:M,1:N) + + ! DGEDMD: Workspace query and workspace allocation + CALL DGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, & + N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, & + LDZ, RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, -1, & + IDUMMY, -1, INFO ) + + LIWORK = IDUMMY(1) + ALLOCATE( IWORK(LIWORK) ) + LWORK = INT(WDUMMY(LWMINOPT)) + ALLOCATE( WORK(LWORK) ) + + ! DGEDMD test: CALL DGEDMD + CALL DGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, & + N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, & + LDZ, RES, AU, LDAU, W, LDW, S, LDS, WORK, LWORK,& + IWORK, LIWORK, INFO ) + + SINGVX(1:N) = WORK(1:N) + + !...... DGEDMD check point + IF ( LSAME(JOBZ,'V') ) THEN + ! Check that Z = X*W, on return from DGEDMD + ! This checks that the returned eigenvectors in Z are + ! the product of the SVD/POD basis returned in X + ! and the eigenvectors of the Rayleigh quotient + !
returned in W + CALL DGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, & + ZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL DAXPY( M, -ONE, Z(1,i), 1, Z1(1,i), 1) + TMP = MAX(TMP, DNRM2( M, Z1(1,i), 1 ) ) + END DO + TMP_ZXW = MAX(TMP_ZXW, TMP ) + + IF ( TMP_ZXW > 10*M*EPS ) THEN + NFAIL_Z_XV = NFAIL_Z_XV + 1 + WRITE(*,*) ':( .................DGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + END IF + + !...... DGEDMD check point + IF ( LSAME(JOBREF,'R') ) THEN + ! The matrix A*U is returned for computing refined Ritz vectors. + ! Check that A*U is computed correctly using the formula + ! A*U = Y * V * inv(SIGMA). This depends on the + ! accuracy in the computed singular values and vectors of X. + ! See the paper for an error analysis. + ! Note that the left singular vectors of the input matrix X + ! are returned in the array X. + CALL DGEMM( 'N', 'N', M, K, M, ONE, A, LDA, X, LDX, & + ZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL DAXPY( M, -ONE, AU(1,i), 1, Z1(1,i), 1) + TMP = MAX( TMP, DNRM2( M, Z1(1,i),1 ) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_AU = MAX( TMP_AU, TMP ) + + IF ( TMP > TOL2 ) THEN + NFAIL_AU = NFAIL_AU + 1 + WRITE(*,*) ':( .................DGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + ELSEIF ( LSAME(JOBREF,'E') ) THEN + ! The unscaled vectors of the Exact DMD are computed. + ! This option is included for the sake of completeness, + ! for users who prefer the Exact DMD vectors. The + ! returned vectors are in the real form, in the same way + ! as the Ritz vectors. Here we just save the vectors + ! and test them separately using a Matlab script. + + CALL DGEMM( 'N', 'N', M, K, M, ONE, A, LDA, AU, LDAU, ZERO, Y1, M ) + i=1 + DO WHILE ( i <= K ) + IF ( IEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL DAXPY( M, -REIG(i), AU(1,i), 1, Y1(1,i), 1 ) + RESEX(i) = DNRM2( M, Y1(1,i), 1) / DNRM2(M,AU(1,i),1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IEIG(i) + AB(1,2) = IEIG(i) + AB(2,2) = REIG(i) + CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, AU(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) + RESEX(i) = DLANGE( 'F', M, 2, Y1(1,i), M, & + WORK )/ DLANGE( 'F', M, 2, AU(1,i), M, & + WORK ) + RESEX(i+1) = RESEX(i) + i = i + 2 + END IF + END DO + + END IF + + !...... DGEDMD check point + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by DGEDMD with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL DGEMM( 'N', 'N', M, K, M, ONE, A, LDA, Z, LDZ, ZERO, Y1, M ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in DGEDMD,) + i = 1 + DO WHILE ( i <= K ) + IF ( IEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL DAXPY( M, -REIG(i), Z(1,i), 1, Y1(1,i), 1 ) + RES1(i) = DNRM2( M, Y1(1,i), 1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! 
Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IEIG(i) + AB(1,2) = IEIG(i) + AB(2,2) = REIG(i) + CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) + RES1(i) = DLANGE( 'F', M, 2, Y1(1,i), M, & + WORK ) + RES1(i+1) = RES1(i) + i = i + 2 + END IF + END DO + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_REZ = MAX( TMP_REZ, TMP ) + + IF ( TMP > TOL2 ) THEN + NFAIL_REZ = NFAIL_REZ + 1 + WRITE(*,*) ':( ..................DGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + IF ( LSAME(JOBREF,'E') ) THEN + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) ) + END DO + TMP_EX = MAX(TMP_EX,TMP) + END IF + + END IF + + !..... store the results for inspection + DO i = 1, K + LAMBDA(i,1) = REIG(i) + LAMBDA(i,2) = IEIG(i) + END DO + + DEALLOCATE(IWORK) + DEALLOCATE(WORK) + + !====================================================================== + ! Now test the DGEDMDQ + !====================================================================== + IF ( TEST_QRDMD .AND. (K_TRAJ == 1) ) THEN + RJOBDATA(2) = 1 + F1 = F + + ! DGEDMDQ test: Workspace query and workspace allocation + CALL DGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, & + JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, & + LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ, & + RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, & + -1, IDUMMY, -1, INFO ) + LIWORK = IDUMMY(1) + ALLOCATE( IWORK(LIWORK) ) + LWORK = INT(WDUMMY(LWMINOPT)) + ALLOCATE(WORK(LWORK)) + ! DGEDMDQ test: CALL DGEDMDQ + CALL DGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, & + JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, & + LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ, & + RES, AU, LDAU, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) + + SINGVQX(1:KQ) = WORK(MIN(M,N+1)+1: MIN(M,N+1)+KQ) + + !..... DGEDMDQ check point + IF ( KQ /= K ) THEN + KDIFF = KDIFF+1 + END IF + + TMP = ZERO + DO i = 1, MIN(K, KQ) + TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / & + SINGVX(1) ) + END DO + SVDIFF = MAX( SVDIFF, TMP ) + IF ( TMP > M*N*EPS ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_SVDIFF = NFAIL_SVDIFF + 1 + DO j =1, 3 + write(*,*) j, SINGVX(j), SINGVQX(j) + read(*,*) + END DO + END IF + + !..... DGEDMDQ check point + IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN + ! Check that the QR factors are computed and returned + ! as requested. The residual ||F-Q*R||_F / ||F||_F + ! is compared to M*N*EPS. + F2 = F + CALL DGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -ONE, F1, & + LDF, Y, LDY, ONE, F2, LDF ) + TMP_FQR = DLANGE( 'F', M, N+1, F2, LDF, WORK ) / & + DLANGE( 'F', M, N+1, F, LDF, WORK ) + IF ( TMP_FQR > TOL2 ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_F_QR = NFAIL_F_QR + 1 + END IF + END IF + + !..... DGEDMDQ check point + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by DGEDMDQ with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL DGEMM( 'N', 'N', M, KQ, M, ONE, A, M, Z, M, ZERO, Y1, M ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. 
(See the description of Z in DGEDMDQ)
+            i = 1
+            DO WHILE ( i <= KQ )
+               IF ( IEIGQ(i) == ZERO ) THEN
+                   ! have a real eigenvalue with real eigenvector
+                   CALL DAXPY( M, -REIGQ(i), Z(1,i), 1, Y1(1,i), 1 )
+                   ! Y(1:M,i) = Y(1:M,i) - REIG(i)*Z(1:M,i)
+                   RES1(i) = DNRM2( M, Y1(1,i), 1)
+                   i = i + 1
+               ELSE
+                  ! Have a complex conjugate pair
+                  ! REIG(i) +- sqrt(-1)*IMEIG(i).
+                  ! Since all computation is done in real
+                  ! arithmetic, the formula for the residual
+                  ! is recast for real representation of the
+                  ! complex conjugate eigenpair. See the
+                  ! description of RES.
+                  AB(1,1) =  REIGQ(i)
+                  AB(2,1) = -IEIGQ(i)
+                  AB(1,2) =  IEIGQ(i)
+                  AB(2,2) =  REIGQ(i)
+                  CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i),        &
+                              M, AB, 2, ONE, Y1(1,i), M )     ! BLAS CALL
+                  ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB   ! INTRINSIC
+                  RES1(i)   = DLANGE( 'F', M, 2, Y1(1,i), M,          &
+                              WORK )                          ! LAPACK CALL
+                  RES1(i+1) = RES1(i)
+                  i = i + 2
+               END IF
+            END DO
+            TMP = ZERO
+            DO i = 1, KQ
+                TMP = MAX( TMP, ABS(RES(i) - RES1(i)) *   &
+                      SINGVQX(KQ)/(ANORM*SINGVQX(1)) )
+            END DO
+            TMP_REZQ = MAX( TMP_REZQ, TMP )
+            IF ( TMP > TOL2 ) THEN
+                NFAIL_REZQ = NFAIL_REZQ + 1
+                WRITE(*,*) '................ DGEDMDQ FAILED!',        &
+                    'Check the code for implementation errors.'
+                STOP
+            END IF
+
+          END IF
+
+          DO i = 1, KQ
+              LAMBDAQ(i,1) = REIGQ(i)
+              LAMBDAQ(i,2) = IEIGQ(i)
+          END DO
+
+          DEALLOCATE(WORK)
+          DEALLOCATE(IWORK)
+        END IF ! TEST_QRDMD
+!======================================================================
+
+      END DO ! LWMINOPT
+      !write(*,*) 'LWMINOPT loop completed'
+      END DO ! WHTSVD LOOP
+      !write(*,*) 'WHTSVD loop completed'
+      END DO ! NRNK LOOP
+      !write(*,*) 'NRNK loop completed'
+      END DO ! SCALE LOOP
+      !write(*,*) 'SCALE loop completed'
+      END DO ! JOBREF LOOP
+      !write(*,*) 'JOBREF loop completed'
+      END DO ! JOBZ LOOP
+      !write(*,*) 'JOBZ loop completed'
+
+      END DO ! MODE 1:6
+      !write(*,*) 'MODE loop completed'
+      END DO ! 1 or 2 trajectories
+      !write(*,*) 'trajectories loop completed'
+
+      DEALLOCATE(A)
+      DEALLOCATE(AC)
+      DEALLOCATE(DA)
+      DEALLOCATE(DL)
+      DEALLOCATE(F)
+      DEALLOCATE(F1)
+      DEALLOCATE(F2)
+      DEALLOCATE(X)
+      DEALLOCATE(X0)
+      DEALLOCATE(SINGVX)
+      DEALLOCATE(SINGVQX)
+      DEALLOCATE(Y)
+      DEALLOCATE(Y0)
+      DEALLOCATE(Y1)
+      DEALLOCATE(Z)
+      DEALLOCATE(Z1)
+      DEALLOCATE(RES)
+      DEALLOCATE(RES1)
+      DEALLOCATE(RESEX)
+      DEALLOCATE(REIG)
+      DEALLOCATE(IEIG)
+      DEALLOCATE(REIGQ)
+      DEALLOCATE(IEIGQ)
+      DEALLOCATE(REIGA)
+      DEALLOCATE(IEIGA)
+      DEALLOCATE(VA)
+      DEALLOCATE(LAMBDA)
+      DEALLOCATE(LAMBDAQ)
+      DEALLOCATE(EIGA)
+      DEALLOCATE(W)
+      DEALLOCATE(AU)
+      DEALLOCATE(S)
+
+!............................................................
+      END DO ! LLOOP
+
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for DGEDMD :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+      IF ( NFAIL_Z_XV == 0 ) THEN
+          WRITE(*,*) '>>>> Z - U*V test PASSED.'
+      ELSE
+          WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)'
+          WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_ZXW
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV
+      END IF
+      IF ( NFAIL_AU == 0 ) THEN
+          WRITE(*,*) '>>>> A*U test PASSED. '
+      ELSE
+          WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)'
+          WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU
+      END IF
+
+      IF ( NFAIL_REZ == 0 ) THEN
+          WRITE(*,*) '>>>> Residual computation test PASSED.'
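+          ! (The adjusted error measure used in this test is
+          !  ABS(RES(i)-RES1(i)) * SINGVX(K)/(ANORM*SINGVX(1)), i.e.
+          !  the difference between the returned and the directly
+          !  recomputed residuals, damped by SINGVX(K)/SINGVX(1),
+          !  roughly the reciprocal condition number of the truncated
+          !  snapshot matrix, and normalized by the Frobenius norm of
+          !  A (ANORM), so that ill-conditioned data do not trigger
+          !  spurious failures against the O(M*N)*EPS threshold TOL2.)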
+      ELSE
+          WRITE(*,*) 'Residual computation test FAILED ', NFAIL_REZ, ' time(s)'
+          WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ
+      END IF
+
+      IF ( NFAIL_TOTAL == 0 ) THEN
+          WRITE(*,*) '>>>> DGEDMD :: ALL TESTS PASSED.'
+      ELSE
+          WRITE(*,*) NFAIL_TOTAL, 'FAILURES!'
+          WRITE(*,*) '>>>>>>>>>>>>>> DGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      IF ( TEST_QRDMD ) THEN
+      WRITE(*,*)
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for DGEDMDQ :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+
+      IF ( NFAIL_SVDIFF == 0 ) THEN
+          WRITE(*,*) '>>>> DGEDMD and DGEDMDQ computed singular &
+                      &values test PASSED.'
+      ELSE
+          WRITE(*,*) 'DGEDMD and DGEDMDQ discrepancies in &
+                     &the singular values unacceptable ', &
+                     NFAIL_SVDIFF, ' times. Test FAILED.'
+          WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF
+      END IF
+
+      IF ( NFAIL_F_QR == 0 ) THEN
+          WRITE(*,*) '>>>> F - Q*R test PASSED.'
+      ELSE
+          WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)'
+          WRITE(*,*) 'The largest relative residual was ', TMP_FQR
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR
+      END IF
+
+      IF ( NFAIL_REZQ == 0 ) THEN
+          WRITE(*,*) '>>>> Residual computation test PASSED.'
+      ELSE
+          WRITE(*,*) 'Residual computation test FAILED ', NFAIL_REZQ, ' time(s)'
+          WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ
+      END IF
+
+      IF ( NFAILQ_TOTAL == 0 ) THEN
+          WRITE(*,*) '>>>>>>> DGEDMDQ :: ALL TESTS PASSED.'
+      ELSE
+          WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!'
+          WRITE(*,*) '>>>>>>> DGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      END IF
+
+      WRITE(*,*)
+      WRITE(*,*) 'Test completed.'
+      STOP
+      END
diff --git a/lapack-netlib/TESTING/EIG/derrst.f b/lapack-netlib/TESTING/EIG/derrst.f
index 059538644..7d111e2e0 100644
--- a/lapack-netlib/TESTING/EIG/derrst.f
+++ b/lapack-netlib/TESTING/EIG/derrst.f
@@ -735,12 +735,12 @@
       CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK )
       INFOT = 18
       CALL DSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $             M, R, Z, 1, IW, Q, 26*N-1, IW( 2*N+1 ), 10*N,
+     $             M, R, Z, 1, IW, Q, 0, IW( 2*N+1 ), 10*N,
      $             INFO )
       CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK )
       INFOT = 20
       CALL DSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $             M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 10*N-1,
+     $             M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 0,
      $             INFO )
       CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK )
       NT = NT + 11
diff --git a/lapack-netlib/TESTING/EIG/schkdmd.f90 b/lapack-netlib/TESTING/EIG/schkdmd.f90
new file mode 100644
index 000000000..77e3e46c0
--- /dev/null
+++ b/lapack-netlib/TESTING/EIG/schkdmd.f90
@@ -0,0 +1,792 @@
+! This is a test program for checking the implementations of
+! the following subroutines
+!
+!   SGEDMD  for computation of the
+!           Dynamic Mode Decomposition (DMD)
+!   SGEDMDQ for computation of a
+!           QR factorization based compressed DMD
+!
+!   Developed and supported by:
+!   ===========================
+!   Developed and coded by Zlatko Drmac, Faculty of Science,
+!   University of Zagreb;  drmac@math.hr
+!   In cooperation with
+!   AIMdyn Inc., Santa Barbara, CA.
+!
========================================================
+!   How to run the code (compiler, link info)
+!   ========================================================
+!   Compile as FORTRAN 90 (or later) and link with BLAS and
+!   LAPACK libraries.
+!   NOTE: The code is developed and tested on top of the
+!   Intel MKL library (versions 2022.0.3 and 2022.2.0),
+!   using the Intel Fortran compiler.
+!
+!   For developers of the C++ implementation
+!   ========================================================
+!   See the LAPACK++ and Template Numerical Toolkit (TNT)
+!
+!   Note on the development of the GPU HP implementation
+!   ========================================================
+!   Work in progress. See CUDA, MAGMA, SLATE.
+!   NOTE: The four SVD subroutines used in this code are
+!   included as part of R&D and for completeness.
+!   This was also an opportunity to test those SVD codes.
+!   If the scaling option is used, all four are essentially
+!   equally good. For implementations on HP platforms,
+!   one can use whichever SVD is available.
+!............................................................
+!   NOTE:
+!   When using the Intel MKL 2022.0.3 the subroutine xGESVDQ
+!   (optionally used in xGEDMD) may cause an access violation
+!   error for x = S, D, C, Z, but only if called with the
+!   workspace query. (At least in our Windows 10 MSVS 2019.)
+!   The problem can be mitigated by downloading the source
+!   code of xGESVDQ from the LAPACK repository and using it
+!   locally instead of the one in the MKL. This seems to
+!   indicate that the problem is indeed in the MKL.
+!   This problem did not appear with Intel MKL 2022.2.0.
+!
+!   NOTE:
+!   xGESDD seems to have a problem with workspace. In some
+!   cases the length of the optimal workspace is returned
+!   smaller than the minimal workspace, as specified in the
+!   code. As a precaution, all optimal workspaces are
+!   set as MAX(minimal, optimal).
+!   Latest implementations of complex xGESDD have different
+!   lengths of the real workspace. We use the max value over
+!   the two versions.
+!............................................................
+!............................................................
+!
+      PROGRAM DMD_TEST
+      use iso_fortran_env, only: real32
+      IMPLICIT NONE
+      integer, parameter :: WP = real32
+
+!............................................................
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+!............................................................
+      REAL(KIND=WP), ALLOCATABLE, DIMENSION(:,:) ::          &
+                     A, AC, EIGA, LAMBDA, LAMBDAQ, F, F1, F2,&
+                     Z, Z1, S, AU, W, VA, X, X0, Y, Y0, Y1
+      REAL(KIND=WP), ALLOCATABLE, DIMENSION(:)   ::          &
+                     DA, DL, DR, REIG, REIGA, REIGQ, IEIG,   &
+                     IEIGA, IEIGQ, RES, RES1, RESEX, SINGVX, &
+                     SINGVQX, WORK
+      INTEGER      , ALLOCATABLE, DIMENSION(:)   ::   IWORK
+      REAL(KIND=WP) :: AB(2,2), WDUMMY(2)
+      INTEGER       :: IDUMMY(2), ISEED(4), RJOBDATA(8)
+      REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, DMAX, EPS, &
+                       TOL, TOL2, SVDIFF, TMP, TMP_AU,       &
+                       TMP_FQR, TMP_REZ, TMP_REZQ, TMP_ZXW,  &
+                       TMP_EX, XNORM, YNORM
+!............................................................
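+!     NOTE: WDUMMY and IDUMMY receive the results of the LAPACK-style
+!     workspace queries below: SGEDMD/SGEDMDQ are first called with
+!     LWORK = -1 and LIWORK = -1, which only computes the workspace
+!     requirements; WDUMMY(1:2) then holds the minimal and the optimal
+!     real workspace lengths (one of which is selected via LWMINOPT)
+!     and IDUMMY(1) the integer workspace length, and these are used
+!     to ALLOCATE WORK and IWORK before the actual computational calls.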
+ INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY, & + LDZ, LIWORK, LWORK, M, N, L, LLOOP, NRNK + INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, KDIFF, & + NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ, & + NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, & + NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD + INTEGER iNRNK, iWHTSVD, K_TRAJ, LWMINOPT + CHARACTER(LEN=1) GRADE, JOBREF, JOBZ, PIVTNG, RSIGN, & + SCALE, RESIDS, WANTQ, WANTR + + LOGICAL TEST_QRDMD +!..... external subroutines (BLAS and LAPACK) + EXTERNAL SAXPY, SGEEV, SGEMM, SGEMV, SLACPY, SLASCL + EXTERNAL SLARNV, SLATMR +!.....external subroutines DMD package, part 1 +! subroutines under test + EXTERNAL SGEDMD, SGEDMDQ + +!..... external functions (BLAS and LAPACK) + EXTERNAL SLAMCH, SLANGE, SNRM2 + REAL(KIND=WP) :: SLAMCH, SLANGE, SNRM2 + EXTERNAL LSAME + LOGICAL LSAME + + INTRINSIC ABS, INT, MIN, MAX +!............................................................ + + ! The test is always in pairs : ( SGEDMD and SGEDMDQ ) + ! because the test includes comparing the results (in pairs). +!..................................................................................... + TEST_QRDMD = .TRUE. ! This code by default performs tests on SGEDMDQ + ! Since the QR factorizations based algorithm is designed for + ! single trajectory data, only single trajectory tests will + ! be performed with xGEDMDQ. + WANTQ = 'Q' + WANTR = 'R' +!................................................................................. + + EPS = SLAMCH( 'P' ) ! machine precision SP + + ! Global counters of failures of some particular tests + NFAIL = 0 + NFAIL_REZ = 0 + NFAIL_REZQ = 0 + NFAIL_Z_XV = 0 + NFAIL_F_QR = 0 + NFAIL_AU = 0 + KDIFF = 0 + NFAIL_SVDIFF = 0 + NFAIL_TOTAL = 0 + NFAILQ_TOTAL = 0 + + + DO LLOOP = 1, 4 + + WRITE(*,*) 'L Loop Index = ', LLOOP + + ! Set the dimensions of the problem ... + WRITE(*,*) 'M = ' + READ(*,*) M + WRITE(*,*) M + ! ... and the number of snapshots. + WRITE(*,*) 'N = ' + READ(*,*) N + WRITE(*,*) N + + ! ... Test the dimensions + IF ( ( MIN(M,N) == 0 ) .OR. ( M < N ) ) THEN + WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.' + STOP + END IF +!............. + ! The seed inside the LLOOP so that each pass can be reproduced easily. + + ISEED(1) = 4 + ISEED(2) = 3 + ISEED(3) = 2 + ISEED(4) = 1 + + LDA = M + LDF = M + LDX = MAX(M,N+1) + LDY = MAX(M,N+1) + LDW = N + LDZ = M + LDAU = MAX(M,N+1) + LDS = N + + TMP_ZXW = ZERO + TMP_AU = ZERO + TMP_REZ = ZERO + TMP_REZQ = ZERO + SVDIFF = ZERO + TMP_EX = ZERO + + ! + ! Test the subroutines on real data snapshots. All + ! computation is done in real arithmetic, even when + ! Koopman eigenvalues and modes are real. + ! + ! Allocate memory space + ALLOCATE( A(LDA,M) ) + ALLOCATE( AC(LDA,M) ) + ALLOCATE( DA(M) ) + ALLOCATE( DL(M) ) + ALLOCATE( F(LDF,N+1) ) + ALLOCATE( F1(LDF,N+1) ) + ALLOCATE( F2(LDF,N+1) ) + ALLOCATE( X(LDX,N) ) + ALLOCATE( X0(LDX,N) ) + ALLOCATE( SINGVX(N) ) + ALLOCATE( SINGVQX(N) ) + ALLOCATE( Y(LDY,N+1) ) + ALLOCATE( Y0(LDY,N+1) ) + ALLOCATE( Y1(M,N+1) ) + ALLOCATE( Z(LDZ,N) ) + ALLOCATE( Z1(LDZ,N) ) + ALLOCATE( RES(N) ) + ALLOCATE( RES1(N) ) + ALLOCATE( RESEX(N) ) + ALLOCATE( REIG(N) ) + ALLOCATE( IEIG(N) ) + ALLOCATE( REIGQ(N) ) + ALLOCATE( IEIGQ(N) ) + ALLOCATE( REIGA(M) ) + ALLOCATE( IEIGA(M) ) + ALLOCATE( VA(LDA,M) ) + ALLOCATE( LAMBDA(N,2) ) + ALLOCATE( LAMBDAQ(N,2) ) + ALLOCATE( EIGA(M,2) ) + ALLOCATE( W(LDW,N) ) + ALLOCATE( AU(LDAU,N) ) + ALLOCATE( S(N,N) ) + + TOL = M*EPS + ! This mimics O(M*N)*EPS bound for accumulated roundoff error. + ! 
The factor 10 is somewhat arbitrary. + TOL2 = 10*M*N*EPS + +!............. + + DO K_TRAJ = 1, 2 + ! Number of intial conditions in the simulation/trajectories (1 or 2) + + COND = 1.0D8 + DMAX = 1.0D2 + RSIGN = 'F' + GRADE = 'N' + MODEL = 6 + CONDL = 1.0D2 + MODER = 6 + CONDR = 1.0D2 + PIVTNG = 'N' + + ! Loop over all parameter MODE values for ZLATMR (+1,..,+6) + DO MODE = 1, 6 + + ALLOCATE( IWORK(2*M) ) + ALLOCATE(DR(N)) + CALL SLATMR( M, M, 'S', ISEED, 'N', DA, MODE, COND, & + DMAX, RSIGN, GRADE, DL, MODEL, CONDL, & + DR, MODER, CONDR, PIVTNG, IWORK, M, M, & + ZERO, -ONE, 'N', A, LDA, IWORK(M+1), INFO ) + DEALLOCATE(IWORK) + DEALLOCATE(DR) + + LWORK = 4*M+1 + ALLOCATE(WORK(LWORK)) + AC = A + CALL SGEEV( 'N','V', M, AC, M, REIGA, IEIGA, VA, M, & + VA, M, WORK, LWORK, INFO ) ! LAPACK CALL + DEALLOCATE(WORK) + TMP = ZERO + DO i = 1, M + EIGA(i,1) = REIGA(i) + EIGA(i,2) = IEIGA(i) + TMP = MAX( TMP, SQRT(REIGA(i)**2+IEIGA(i)**2)) + END DO + + ! Scale A to have the desirable spectral radius. + CALL SLASCL( 'G', 0, 0, TMP, ONE, M, M, A, M, INFO ) + CALL SLASCL( 'G', 0, 0, TMP, ONE, M, 2, EIGA, M, INFO ) + + ! Compute the norm of A + ANORM = SLANGE( 'F', N, N, A, M, WDUMMY ) + + IF ( K_TRAJ == 2 ) THEN + ! generate data with two inital conditions + CALL SLARNV(2, ISEED, M, F1(1,1) ) + F1(1:M,1) = 1.0E-10*F1(1:M,1) + DO i = 1, N/2 + CALL SGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, & + F1(1,i+1), 1 ) + END DO + X0(1:M,1:N/2) = F1(1:M,1:N/2) + Y0(1:M,1:N/2) = F1(1:M,2:N/2+1) + + CALL SLARNV(2, ISEED, M, F1(1,1) ) + DO i = 1, N-N/2 + CALL SGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, & + F1(1,i+1), 1 ) + END DO + X0(1:M,N/2+1:N) = F1(1:M,1:N-N/2) + Y0(1:M,N/2+1:N) = F1(1:M,2:N-N/2+1) + ELSE + ! single trajectory + CALL SLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N + CALL SGEMV( 'N', M, M, ONE, A, M, F(1,i), 1, ZERO, & + F(1,i+1), 1 ) + END DO + X0(1:M,1:N) = F(1:M,1:N) + Y0(1:M,1:N) = F(1:M,2:N+1) + END IF + + XNORM = SLANGE( 'F', M, N, X0, LDX, WDUMMY ) + YNORM = SLANGE( 'F', M, N, Y0, LDX, WDUMMY ) +!............................................................ + + DO iJOBZ = 1, 4 + + SELECT CASE ( iJOBZ ) + CASE(1) + JOBZ = 'V' ! Ritz vectors will be computed + RESIDS = 'R' ! Residuals will be computed + CASE(2) + JOBZ = 'V' + RESIDS = 'N' + CASE(3) + JOBZ = 'F' ! Ritz vectors in factored form + RESIDS = 'N' + CASE(4) + JOBZ = 'N' + RESIDS = 'N' + END SELECT + + DO iJOBREF = 1, 3 + + SELECT CASE ( iJOBREF ) + CASE(1) + JOBREF = 'R' ! Data for refined Ritz vectors + CASE(2) + JOBREF = 'E' ! Exact DMD vectors + CASE(3) + JOBREF = 'N' + END SELECT + + DO iSCALE = 1, 4 + + SELECT CASE ( iSCALE ) + CASE(1) + SCALE = 'S' ! X data normalized + CASE(2) + SCALE = 'C' ! X normalized, consist. check + CASE(3) + SCALE = 'Y' ! Y data normalized + CASE(4) + SCALE = 'N' + END SELECT + + DO iNRNK = -1, -2, -1 + ! Two truncation strategies. The "-2" case for R&D + ! purposes only - it uses possibly low accuracy small + ! singular values, in which case the formulas used in + ! the DMD are highly sensitive. + NRNK = iNRNK + + DO iWHTSVD = 1, 4 + ! Check all four options to compute the POD basis + ! via the SVD. + WHTSVD = iWHTSVD + + DO LWMINOPT = 1, 2 + ! Workspace query for the minimal (1) and for the optimal + ! (2) workspace lengths determined by workspace query. + + X(1:M,1:N) = X0(1:M,1:N) + Y(1:M,1:N) = Y0(1:M,1:N) + + ! 
SGEDMD: Workspace query and workspace allocation + CALL SGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, & + N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, & + LDZ, RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, -1, & + IDUMMY, -1, INFO ) + + LIWORK = IDUMMY(1) + ALLOCATE( IWORK(LIWORK) ) + LWORK = INT(WDUMMY(LWMINOPT)) + ALLOCATE( WORK(LWORK) ) + + ! SGEDMD test: CALL SGEDMD + CALL SGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, & + N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, & + LDZ, RES, AU, LDAU, W, LDW, S, LDS, WORK, LWORK,& + IWORK, LIWORK, INFO ) + + SINGVX(1:N) = WORK(1:N) + + !...... SGEDMD check point + IF ( LSAME(JOBZ,'V') ) THEN + ! Check that Z = X*W, on return from SGEDMD + ! This checks that the returned aigenvectors in Z are + ! the product of the SVD'POD basis returned in X + ! and the eigenvectors of the rayleigh quotient + ! returned in W + CALL SGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, & + ZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL SAXPY( M, -ONE, Z(1,i), 1, Z1(1,i), 1) + TMP = MAX(TMP, SNRM2( M, Z1(1,i), 1 ) ) + END DO + TMP_ZXW = MAX(TMP_ZXW, TMP ) + + IF ( TMP_ZXW > 10*M*EPS ) THEN + NFAIL_Z_XV = NFAIL_Z_XV + 1 + END IF + + END IF + + !...... SGEDMD check point + IF ( LSAME(JOBREF,'R') ) THEN + ! The matrix A*U is returned for computing refined Ritz vectors. + ! Check that A*U is computed correctly using the formula + ! A*U = Y * V * inv(SIGMA). This depends on the + ! accuracy in the computed singular values and vectors of X. + ! See the paper for an error analysis. + ! Note that the left singular vectors of the input matrix X + ! are returned in the array X. + CALL SGEMM( 'N', 'N', M, K, M, ONE, A, LDA, X, LDX, & + ZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL SAXPY( M, -ONE, AU(1,i), 1, Z1(1,i), 1) + TMP = MAX( TMP, SNRM2( M, Z1(1,i),1 ) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_AU = MAX( TMP_AU, TMP ) + + IF ( TMP > TOL2 ) THEN + NFAIL_AU = NFAIL_AU + 1 + END IF + + ELSEIF ( LSAME(JOBREF,'E') ) THEN + ! The unscaled vectors of the Exact DMD are computed. + ! This option is included for the sake of completeness, + ! for users who prefer the Exact DMD vectors. The + ! returned vectors are in the real form, in the same way + ! as the Ritz vectors. Here we just save the vectors + ! and test them separately using a Matlab script. + + CALL SGEMM( 'N', 'N', M, K, M, ONE, A, LDA, AU, LDAU, ZERO, Y1, M ) + i=1 + DO WHILE ( i <= K ) + IF ( IEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL SAXPY( M, -REIG(i), AU(1,i), 1, Y1(1,i), 1 ) + RESEX(i) = SNRM2( M, Y1(1,i), 1) / SNRM2(M,AU(1,i),1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IEIG(i) + AB(1,2) = IEIG(i) + AB(2,2) = REIG(i) + CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, AU(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) + RESEX(i) = SLANGE( 'F', M, 2, Y1(1,i), M, & + WORK )/ SLANGE( 'F', M, 2, AU(1,i), M, & + WORK ) + RESEX(i+1) = RESEX(i) + i = i + 2 + END IF + END DO + + END IF + + !...... SGEDMD check point + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by SGEDMD with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL SGEMM( 'N', 'N', M, K, M, ONE, A, LDA, Z, LDZ, ZERO, Y1, M ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! 
of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in SGEDMD,) + i = 1 + DO WHILE ( i <= K ) + IF ( IEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL SAXPY( M, -REIG(i), Z(1,i), 1, Y1(1,i), 1 ) + RES1(i) = SNRM2( M, Y1(1,i), 1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IEIG(i) + AB(1,2) = IEIG(i) + AB(2,2) = REIG(i) + CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) + RES1(i) = SLANGE( 'F', M, 2, Y1(1,i), M, & + WORK ) + RES1(i+1) = RES1(i) + i = i + 2 + END IF + END DO + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_REZ = MAX( TMP_REZ, TMP ) + + IF ( TMP > TOL2 ) THEN + NFAIL_REZ = NFAIL_REZ + 1 + END IF + + IF ( LSAME(JOBREF,'E') ) THEN + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) ) + END DO + TMP_EX = MAX(TMP_EX,TMP) + END IF + + END IF + + ! ... store the results for inspection + DO i = 1, K + LAMBDA(i,1) = REIG(i) + LAMBDA(i,2) = IEIG(i) + END DO + + DEALLOCATE(IWORK) + DEALLOCATE(WORK) + + !====================================================================== + ! Now test the SGEDMDQ, if requested. + !====================================================================== + IF ( TEST_QRDMD .AND. (K_TRAJ == 1) ) THEN + RJOBDATA(2) = 1 + F1 = F + + ! SGEDMDQ test: Workspace query and workspace allocation + CALL SGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, & + JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, & + LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ, & + RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, & + -1, IDUMMY, -1, INFO ) + LIWORK = IDUMMY(1) + ALLOCATE( IWORK(LIWORK) ) + LWORK = INT(WDUMMY(LWMINOPT)) + ALLOCATE(WORK(LWORK)) + + ! SGEDMDQ test: CALL SGEDMDQ + CALL SGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, & + JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, & + LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ, & + RES, AU, LDAU, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) + + SINGVQX(1:KQ) = WORK(MIN(M,N+1)+1: MIN(M,N+1)+KQ) + + !..... SGEDMDQ check point + IF ( KQ /= K ) THEN + KDIFF = KDIFF+1 + END IF + + TMP = ZERO + DO i = 1, MIN(K, KQ) + TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / & + SINGVX(1) ) + END DO + SVDIFF = MAX( SVDIFF, TMP ) + IF ( TMP > M*N*EPS ) THEN + NFAIL_SVDIFF = NFAIL_SVDIFF + 1 + END IF + + !..... SGEDMDQ check point + IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN + ! Check that the QR factors are computed and returned + ! as requested. The residual ||F-Q*R||_F / ||F||_F + ! is compared to M*N*EPS. + F2 = F + CALL SGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -ONE, F1, & + LDF, Y, LDY, ONE, F2, LDF ) + TMP_FQR = SLANGE( 'F', M, N+1, F2, LDF, WORK ) / & + SLANGE( 'F', M, N+1, F, LDF, WORK ) + IF ( TMP_FQR > TOL2 ) THEN + NFAIL_F_QR = NFAIL_F_QR + 1 + END IF + END IF + + !..... SGEDMDQ checkpoint + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by SGEDMDQ with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL SGEMM( 'N', 'N', M, KQ, M, ONE, A, M, Z, M, ZERO, Y1, M ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! 
pairs of eigenvalues. (See the description of Z in SGEDMDQ)
+            i = 1
+            DO WHILE ( i <= KQ )
+               IF ( IEIGQ(i) == ZERO ) THEN
+                   ! have a real eigenvalue with real eigenvector
+                   CALL SAXPY( M, -REIGQ(i), Z(1,i), 1, Y1(1,i), 1 )
+                   ! Y(1:M,i) = Y(1:M,i) - REIG(i)*Z(1:M,i)
+                   RES1(i) = SNRM2( M, Y1(1,i), 1)
+                   i = i + 1
+               ELSE
+                  ! Have a complex conjugate pair
+                  ! REIG(i) +- sqrt(-1)*IMEIG(i).
+                  ! Since all computation is done in real
+                  ! arithmetic, the formula for the residual
+                  ! is recast for real representation of the
+                  ! complex conjugate eigenpair. See the
+                  ! description of RES.
+                  AB(1,1) =  REIGQ(i)
+                  AB(2,1) = -IEIGQ(i)
+                  AB(1,2) =  IEIGQ(i)
+                  AB(2,2) =  REIGQ(i)
+                  CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i),        &
+                              M, AB, 2, ONE, Y1(1,i), M )     ! BLAS CALL
+                  ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB   ! INTRINSIC
+                  RES1(i)   = SLANGE( 'F', M, 2, Y1(1,i), M,          &
+                              WORK )                          ! LAPACK CALL
+                  RES1(i+1) = RES1(i)
+                  i = i + 2
+               END IF
+            END DO
+            TMP = ZERO
+            DO i = 1, KQ
+                TMP = MAX( TMP, ABS(RES(i) - RES1(i)) *   &
+                      SINGVQX(KQ)/(ANORM*SINGVQX(1)) )
+            END DO
+            TMP_REZQ = MAX( TMP_REZQ, TMP )
+            IF ( TMP > TOL2 ) THEN
+                NFAIL_REZQ = NFAIL_REZQ + 1
+            END IF
+
+          END IF
+
+          DO i = 1, KQ
+              LAMBDAQ(i,1) = REIGQ(i)
+              LAMBDAQ(i,2) = IEIGQ(i)
+          END DO
+
+          DEALLOCATE(WORK)
+          DEALLOCATE(IWORK)
+        END IF ! TEST_QRDMD
+!======================================================================
+
+      END DO ! LWMINOPT
+      !write(*,*) 'LWMINOPT loop completed'
+      END DO ! WHTSVD LOOP
+      !write(*,*) 'WHTSVD loop completed'
+      END DO ! NRNK LOOP
+      !write(*,*) 'NRNK loop completed'
+      END DO ! SCALE LOOP
+      !write(*,*) 'SCALE loop completed'
+      END DO ! JOBREF LOOP
+      !write(*,*) 'JOBREF loop completed'
+      END DO ! JOBZ LOOP
+      !write(*,*) 'JOBZ loop completed'
+
+      END DO ! MODE 1:6
+      !write(*,*) 'MODE loop completed'
+      END DO ! 1 or 2 trajectories
+      !write(*,*) 'trajectories loop completed'
+
+      DEALLOCATE(A)
+      DEALLOCATE(AC)
+      DEALLOCATE(DA)
+      DEALLOCATE(DL)
+      DEALLOCATE(F)
+      DEALLOCATE(F1)
+      DEALLOCATE(F2)
+      DEALLOCATE(X)
+      DEALLOCATE(X0)
+      DEALLOCATE(SINGVX)
+      DEALLOCATE(SINGVQX)
+      DEALLOCATE(Y)
+      DEALLOCATE(Y0)
+      DEALLOCATE(Y1)
+      DEALLOCATE(Z)
+      DEALLOCATE(Z1)
+      DEALLOCATE(RES)
+      DEALLOCATE(RES1)
+      DEALLOCATE(RESEX)
+      DEALLOCATE(REIG)
+      DEALLOCATE(IEIG)
+      DEALLOCATE(REIGQ)
+      DEALLOCATE(IEIGQ)
+      DEALLOCATE(REIGA)
+      DEALLOCATE(IEIGA)
+      DEALLOCATE(VA)
+      DEALLOCATE(LAMBDA)
+      DEALLOCATE(LAMBDAQ)
+      DEALLOCATE(EIGA)
+      DEALLOCATE(W)
+      DEALLOCATE(AU)
+      DEALLOCATE(S)
+
+!............................................................
+      END DO ! LLOOP
+
+
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for SGEDMD :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+      IF ( NFAIL_Z_XV == 0 ) THEN
+          WRITE(*,*) '>>>> Z - U*V test PASSED.'
+      ELSE
+          WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)'
+          WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_ZXW
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV
+      END IF
+      IF ( NFAIL_AU == 0 ) THEN
+          WRITE(*,*) '>>>> A*U test PASSED. '
+      ELSE
+          WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)'
+          WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU
+      END IF
+
+      IF ( NFAIL_REZ == 0 ) THEN
+          WRITE(*,*) '>>>> Residual computation test PASSED.'
+      ELSE
+          WRITE(*,*) 'Residual computation test FAILED ', NFAIL_REZ, ' time(s)'
+          WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ
+      END IF
+
+      IF ( NFAIL_TOTAL == 0 ) THEN
+          WRITE(*,*) '>>>> SGEDMD :: ALL TESTS PASSED.'
+      ELSE
+          WRITE(*,*) NFAIL_TOTAL, 'FAILURES!'
+          WRITE(*,*) '>>>>>>>>>>>>>> SGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      IF ( TEST_QRDMD ) THEN
+      WRITE(*,*)
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for SGEDMDQ :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+
+      IF ( NFAIL_SVDIFF == 0 ) THEN
+          WRITE(*,*) '>>>> SGEDMD and SGEDMDQ computed singular &
+                      &values test PASSED.'
+      ELSE
+          WRITE(*,*) 'SGEDMD and SGEDMDQ discrepancies in &
+                     &the singular values unacceptable ', &
+                     NFAIL_SVDIFF, ' times. Test FAILED.'
+          WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF
+      END IF
+
+      IF ( NFAIL_F_QR == 0 ) THEN
+          WRITE(*,*) '>>>> F - Q*R test PASSED.'
+      ELSE
+          WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)'
+          WRITE(*,*) 'The largest relative residual was ', TMP_FQR
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR
+      END IF
+
+      IF ( NFAIL_REZQ == 0 ) THEN
+          WRITE(*,*) '>>>> Residual computation test PASSED.'
+      ELSE
+          WRITE(*,*) 'Residual computation test FAILED ', NFAIL_REZQ, ' time(s)'
+          WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ
+      END IF
+
+      IF ( NFAILQ_TOTAL == 0 ) THEN
+          WRITE(*,*) '>>>>>>> SGEDMDQ :: ALL TESTS PASSED.'
+      ELSE
+          WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!'
+          WRITE(*,*) '>>>>>>> SGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      END IF
+
+      WRITE(*,*)
+      WRITE(*,*) 'Test completed.'
+      STOP
+      END
diff --git a/lapack-netlib/TESTING/EIG/serrst.f b/lapack-netlib/TESTING/EIG/serrst.f
index b87fc42ef..408346382 100644
--- a/lapack-netlib/TESTING/EIG/serrst.f
+++ b/lapack-netlib/TESTING/EIG/serrst.f
@@ -733,12 +733,12 @@
       CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK )
       INFOT = 18
       CALL SSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0,
-     $             M, R, Z, 1, IW, Q, 26*N-1, IW( 2*N+1 ), 10*N,
+     $             M, R, Z, 1, IW, Q, 0, IW( 2*N+1 ), 10*N,
      $             INFO )
       CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK )
       INFOT = 20
       CALL SSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0,
-     $             M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 10*N-1,
+     $             M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 0,
      $             INFO )
       CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK )
       NT = NT + 11
diff --git a/lapack-netlib/TESTING/EIG/zchkdmd.f90 b/lapack-netlib/TESTING/EIG/zchkdmd.f90
new file mode 100644
index 000000000..873d956c4
--- /dev/null
+++ b/lapack-netlib/TESTING/EIG/zchkdmd.f90
@@ -0,0 +1,745 @@
+! This is a test program for checking the implementations of
+! the following subroutines
+!
+!   ZGEDMD,  for computation of the
+!            Dynamic Mode Decomposition (DMD)
+!   ZGEDMDQ, for computation of a
+!            QR factorization based compressed DMD
+!
+!   Developed and supported by:
+!   ===========================
+!   Developed and coded by Zlatko Drmac, Faculty of Science,
+!   University of Zagreb;  drmac@math.hr
+!   In cooperation with
+!   AIMdyn Inc., Santa Barbara, CA.
+!
======================================================== +! How to run the code (compiler, link info) +! ======================================================== +! Compile as FORTRAN 90 (or later) and link with BLAS and +! LAPACK libraries. +! NOTE: The code is developed and tested on top of the +! Intel MKL library (versions 2022.0.3 and 2022.2.0), +! using the Intel Fortran compiler. +! +! For developers of the C++ implementation +! ======================================================== +! See the LAPACK++ and Template Numerical Toolkit (TNT) +! +! Note on a development of the GPU HP implementation +! ======================================================== +! Work in progress. See CUDA, MAGMA, SLATE. +! NOTE: The four SVD subroutines used in this code are +! included as a part of R&D and for the completeness. +! This was also an opportunity to test those SVD codes. +! If the scaling option is used all four are essentially +! equally good. For implementations on HP platforms, +! one can use whichever SVD is available. +!............................................................ + +!............................................................ +!............................................................ +! + PROGRAM DMD_TEST + use iso_fortran_env, only: real64 + IMPLICIT NONE + integer, parameter :: WP = real64 + +!............................................................ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP + + COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) + COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) +!............................................................ + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:) :: RES, & + RES1, RESEX, SINGVX, SINGVQX, WORK + INTEGER , ALLOCATABLE, DIMENSION(:) :: IWORK + REAL(KIND=WP) :: WDUMMY(2) + INTEGER :: IDUMMY(4), ISEED(4) + REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, EPS, & + TOL, TOL2, SVDIFF, TMP, TMP_AU, & + TMP_FQR, TMP_REZ, TMP_REZQ, TMP_ZXW, & + TMP_EX + +!............................................................ + COMPLEX(KIND=WP) :: ZMAX + INTEGER :: LZWORK + COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:,:) :: ZA, ZAC, & + ZAU, ZF, ZF0, ZF1, ZS, ZW, & + ZX, ZX0, ZY, ZY0, ZY1, ZZ, ZZ1 + COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:) :: ZDA, ZDR, & + ZDL, ZEIGS, ZEIGSA, ZWORK + COMPLEX(KIND=WP) :: ZDUMMY(22), ZDUM2X2(2,2) +!............................................................ + INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY, & + LDZ, LIWORK, LWORK, M, N, LLOOP, NRNK, NRNKsp + INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, j, & + NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ, & + NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, & + NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD, & + WHTSVDsp + INTEGER :: iNRNK, iWHTSVD, K_TRAJ, LWMINOPT + CHARACTER :: GRADE, JOBREF, JOBZ, PIVTNG, RSIGN, & + SCALE, RESIDS, WANTQ, WANTR + LOGICAL :: TEST_QRDMD + +!.....external subroutines (BLAS and LAPACK) + EXTERNAL DAXPY, DGEEV, DGEMM, DGEMV, DLACPY, DLASCL + EXTERNAL ZGEEV, ZGEMV, ZLASCL + EXTERNAL ZLARNV, ZLATMR + EXTERNAL ZAXPY, ZGEMM +!.....external subroutines DMD package, part 1 +! subroutines under test + EXTERNAL ZGEDMD, ZGEDMDQ +!.....external functions (BLAS and LAPACK) + EXTERNAL DLAMCH, DZNRM2 + REAL(KIND=WP) :: DLAMCH, DZNRM2 + REAL(KIND=WP) :: ZLANGE + EXTERNAL IZAMAX + INTEGER IZAMAX + EXTERNAL LSAME + LOGICAL LSAME + + INTRINSIC ABS, INT, MIN, MAX, SIGN +!............................................................ + + ! 
The test is always in pairs : ( ZGEDMD and ZGEDMDQ ) + ! because the test includes comparing the results (in pairs). +!..................................................................................... + TEST_QRDMD = .TRUE. ! This code by default performs tests on ZGEDMDQ + ! Since the QR factorizations based algorithm is designed for + ! single trajectory data, only single trajectory tests will + ! be performed with xGEDMDQ. + WANTQ = 'Q' + WANTR = 'R' +!................................................................................. + + EPS = DLAMCH( 'P' ) ! machine precision DP + + ! Global counters of failures of some particular tests + NFAIL = 0 + NFAIL_REZ = 0 + NFAIL_REZQ = 0 + NFAIL_Z_XV = 0 + NFAIL_F_QR = 0 + NFAIL_AU = 0 + NFAIL_SVDIFF = 0 + NFAIL_TOTAL = 0 + NFAILQ_TOTAL = 0 + + DO LLOOP = 1, 4 + + WRITE(*,*) 'L Loop Index = ', LLOOP + + ! Set the dimensions of the problem ... + WRITE(*,*) 'M = ' + READ(*,*) M + WRITE(*,*) M + ! ... and the number of snapshots. + WRITE(*,*) 'N = ' + READ(*,*) N + WRITE(*,*) N + + ! ... Test the dimensions + IF ( ( MIN(M,N) == 0 ) .OR. ( M < N ) ) THEN + WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.' + STOP + END IF +!............. + ! The seed inside the LLOOP so that each pass can be reproduced easily. + ISEED(1) = 4 + ISEED(2) = 3 + ISEED(3) = 2 + ISEED(4) = 1 + + LDA = M + LDF = M + LDX = M + LDY = M + LDW = N + LDZ = M + LDAU = M + LDS = N + + TMP_ZXW = ZERO + TMP_AU = ZERO + TMP_REZ = ZERO + TMP_REZQ = ZERO + SVDIFF = ZERO + TMP_EX = ZERO + + ALLOCATE( ZA(LDA,M) ) + ALLOCATE( ZAC(LDA,M) ) + ALLOCATE( ZF(LDF,N+1) ) + ALLOCATE( ZF0(LDF,N+1) ) + ALLOCATE( ZF1(LDF,N+1) ) + ALLOCATE( ZX(LDX,N) ) + ALLOCATE( ZX0(LDX,N) ) + ALLOCATE( ZY(LDY,N+1) ) + ALLOCATE( ZY0(LDY,N+1) ) + ALLOCATE( ZY1(LDY,N+1) ) + ALLOCATE( ZAU(LDAU,N) ) + ALLOCATE( ZW(LDW,N) ) + ALLOCATE( ZS(LDS,N) ) + ALLOCATE( ZZ(LDZ,N) ) + ALLOCATE( ZZ1(LDZ,N) ) + ALLOCATE( RES(N) ) + ALLOCATE( RES1(N) ) + ALLOCATE( RESEX(N) ) + ALLOCATE( ZEIGS(N) ) + ALLOCATE( SINGVX(N) ) + ALLOCATE( SINGVQX(N) ) + + TOL = M*EPS + ! This mimics O(M*N)*EPS bound for accumulated roundoff error. + ! The factor 10 is somewhat arbitrary. + TOL2 = 10*M*N*EPS + +!............. + + DO K_TRAJ = 1, 2 + ! Number of intial conditions in the simulation/trajectories (1 or 2) + + COND = 1.0D4 + ZMAX = (1.0D1,1.0D1) + RSIGN = 'F' + GRADE = 'N' + MODEL = 6 + CONDL = 1.0D1 + MODER = 6 + CONDR = 1.0D1 + PIVTNG = 'N' + + ! Loop over all parameter MODE values for ZLATMR (+1,..,+6) + DO MODE = 1, 6 + + ALLOCATE( IWORK(2*M) ) + ALLOCATE( ZDA(M) ) + ALLOCATE( ZDL(M) ) + ALLOCATE( ZDR(M) ) + + CALL ZLATMR( M, M, 'N', ISEED, 'N', ZDA, MODE, COND, & + ZMAX, RSIGN, GRADE, ZDL, MODEL, CONDL, & + ZDR, MODER, CONDR, PIVTNG, IWORK, M, M, & + ZERO, -ONE, 'N', ZA, LDA, IWORK(M+1), INFO ) + DEALLOCATE( ZDR ) + DEALLOCATE( ZDL ) + DEALLOCATE( ZDA ) + DEALLOCATE( IWORK ) + + LZWORK = MAX(1,2*M) + ALLOCATE( ZEIGSA(M) ) + ALLOCATE( ZWORK(LZWORK) ) + ALLOCATE( WORK(2*M) ) + ZAC(1:M,1:M) = ZA(1:M,1:M) + CALL ZGEEV( 'N','N', M, ZAC, LDA, ZEIGSA, ZDUM2X2, 2, & + ZDUM2X2, 2, ZWORK, LZWORK, WORK, INFO ) ! LAPACK CALL + DEALLOCATE(WORK) + DEALLOCATE(ZWORK) + + TMP = ABS(ZEIGSA(IZAMAX(M, ZEIGSA, 1))) ! The spectral radius of ZA + ! Scale the matrix ZA to have unit spectral radius. + CALL ZLASCL( 'G',0, 0, TMP, ONE, M, M, & + ZA, LDA, INFO ) + CALL ZLASCL( 'G',0, 0, TMP, ONE, M, 1, & + ZEIGSA, M, INFO ) + ANORM = ZLANGE( 'F', M, M, ZA, LDA, WDUMMY ) + + IF ( K_TRAJ == 2 ) THEN + ! generate data as two trajectories + ! 
with two inital conditions + CALL ZLARNV(2, ISEED, M, ZF(1,1) ) + DO i = 1, N/2 + CALL ZGEMV( 'N', M, M, ZONE, ZA, LDA, ZF(1,i), 1, & + ZZERO, ZF(1,i+1), 1 ) + END DO + ZX0(1:M,1:N/2) = ZF(1:M,1:N/2) + ZY0(1:M,1:N/2) = ZF(1:M,2:N/2+1) + + CALL ZLARNV(2, ISEED, M, ZF(1,1) ) + DO i = 1, N-N/2 + CALL ZGEMV( 'N', M, M, ZONE, ZA, LDA, ZF(1,i), 1, & + ZZERO, ZF(1,i+1), 1 ) + END DO + ZX0(1:M,N/2+1:N) = ZF(1:M,1:N-N/2) + ZY0(1:M,N/2+1:N) = ZF(1:M,2:N-N/2+1) + ELSE + CALL ZLARNV(2, ISEED, M, ZF(1,1) ) + DO i = 1, N + CALL ZGEMV( 'N', M, M, ZONE, ZA, M, ZF(1,i), 1, & + ZZERO, ZF(1,i+1), 1 ) + END DO + ZF0(1:M,1:N+1) = ZF(1:M,1:N+1) + ZX0(1:M,1:N) = ZF0(1:M,1:N) + ZY0(1:M,1:N) = ZF0(1:M,2:N+1) + END IF + + DEALLOCATE( ZEIGSA ) +!........................................................................ + + DO iJOBZ = 1, 4 + + SELECT CASE ( iJOBZ ) + CASE(1) + JOBZ = 'V' + RESIDS = 'R' + CASE(2) + JOBZ = 'V' + RESIDS = 'N' + CASE(3) + JOBZ = 'F' + RESIDS = 'N' + CASE(4) + JOBZ = 'N' + RESIDS = 'N' + END SELECT + + DO iJOBREF = 1, 3 + + SELECT CASE ( iJOBREF ) + CASE(1) + JOBREF = 'R' + CASE(2) + JOBREF = 'E' + CASE(3) + JOBREF = 'N' + END SELECT + + DO iSCALE = 1, 4 + + SELECT CASE ( iSCALE ) + CASE(1) + SCALE = 'S' + CASE(2) + SCALE = 'C' + CASE(3) + SCALE = 'Y' + CASE(4) + SCALE = 'N' + END SELECT + + DO iNRNK = -1, -2, -1 + NRNK = iNRNK + NRNKsp = iNRNK + + DO iWHTSVD = 1, 3 + ! Check all four options to compute the POD basis + ! via the SVD. + WHTSVD = iWHTSVD + WHTSVDsp = iWHTSVD + + DO LWMINOPT = 1, 2 + ! Workspace query for the minimal (1) and for the optimal + ! (2) workspace lengths determined by workspace query. + + ! ZGEDMD is always tested and its results are also used for + ! comparisons with ZGEDMDQ. + + ZX(1:M,1:N) = ZX0(1:M,1:N) + ZY(1:M,1:N) = ZY0(1:M,1:N) + + CALL ZGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, ZX, LDX, ZY, LDY, NRNK, TOL, & + K, ZEIGS, ZZ, LDZ, RES, ZAU, LDAU, & + ZW, LDW, ZS, LDS, ZDUMMY, -1, & + WDUMMY, -1, IDUMMY, -1, INFO ) + IF ( (INFO .EQ. 2) .OR. ( INFO .EQ. 3 ) & + .OR. ( INFO < 0 ) ) THEN + WRITE(*,*) 'Call to ZGEDMD workspace query failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ', & + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL, LDZ, LDAU, LDW, LDS + STOP + END IF + + LZWORK = INT(ZDUMMY(LWMINOPT)) + LWORK = INT(WDUMMY(1)) + LIWORK = IDUMMY(1) + + ALLOCATE(ZWORK(LZWORK)) + ALLOCATE( WORK(LWORK)) + ALLOCATE(IWORK(LIWORK)) + + CALL ZGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, ZX, LDX, ZY, LDY, NRNK, TOL, & + K, ZEIGS, ZZ, LDZ, RES, ZAU, LDAU, & + ZW, LDW, ZS, LDS, ZWORK, LZWORK, & + WORK, LWORK, IWORK, LIWORK, INFO ) + + IF ( INFO /= 0 ) THEN + WRITE(*,*) 'Call to ZGEDMD failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + STOP + END IF + + SINGVX(1:N) = WORK(1:N) + + !...... ZGEDMD check point + IF ( LSAME(JOBZ,'V') ) THEN + ! Check that Z = X*W, on return from ZGEDMD + ! This checks that the returned eigenvectors in Z are + ! the product of the SVD'POD basis returned in X + ! and the eigenvectors of the rayleigh quotient + ! 
returned in W + CALL ZGEMM( 'N', 'N', M, K, K, ZONE, ZX, LDX, ZW, LDW, & + ZZERO, ZZ1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL ZAXPY( M, -ZONE, ZZ(1,i), 1, ZZ1(1,i), 1) + TMP = MAX(TMP, DZNRM2( M, ZZ1(1,i), 1 ) ) + END DO + TMP_ZXW = MAX(TMP_ZXW, TMP ) + IF ( TMP_ZXW <= 10*M*EPS ) THEN + !WRITE(*,*) ' :) .... OK .........ZGEDMD PASSED.' + ELSE + NFAIL_Z_XV = NFAIL_Z_XV + 1 + WRITE(*,*) ':( .................ZGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + END IF + + + !...... ZGEDMD check point + IF ( LSAME(JOBREF,'R') ) THEN + ! The matrix A*U is returned for computing refined Ritz vectors. + ! Check that A*U is computed correctly using the formula + ! A*U = Y * V * inv(SIGMA). This depends on the + ! accuracy in the computed singular values and vectors of X. + ! See the paper for an error analysis. + ! Note that the left singular vectors of the input matrix X + ! are returned in the array X. + CALL ZGEMM( 'N', 'N', M, K, M, ZONE, ZA, LDA, ZX, LDX, & + ZZERO, ZZ1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL ZAXPY( M, -ZONE, ZAU(1,i), 1, ZZ1(1,i), 1) + TMP = MAX( TMP, DZNRM2( M, ZZ1(1,i),1 ) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_AU = MAX( TMP_AU, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) ':) .... OK .........ZGEDMD PASSED.' + ELSE + NFAIL_AU = NFAIL_AU + 1 + WRITE(*,*) ':( .................ZGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + ELSEIF ( LSAME(JOBREF,'E') ) THEN + ! The unscaled vectors of the Exact DMD are computed. + ! This option is included for the sake of completeness, + ! for users who prefer the Exact DMD vectors. The + ! returned vectors are in the real form, in the same way + ! as the Ritz vectors. Here we just save the vectors + ! and test them separately using a Matlab script. + + + CALL ZGEMM( 'N', 'N', M, K, M, ZONE, ZA, LDA, ZAU, LDAU, ZZERO, ZY1, LDY ) + + DO i=1, K + ! have a real eigenvalue with real eigenvector + CALL ZAXPY( M, -ZEIGS(i), ZAU(1,i), 1, ZY1(1,i), 1 ) + RESEX(i) = DZNRM2( M, ZY1(1,i), 1) / DZNRM2(M,ZAU(1,i),1) + END DO + END IF + !...... ZGEDMD check point + + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by ZGEDMD with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL ZGEMM( 'N', 'N', M, K, M, ZONE, ZA, LDA, ZZ, LDZ, ZZERO, ZY1, LDY ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in ZGEDMD,) + + DO i=1, K + ! have a real eigenvalue with real eigenvector + CALL ZAXPY( M, -ZEIGS(i), ZZ(1,i), 1, ZY1(1,i), 1 ) + RES1(i) = DZNRM2( M, ZY1(1,i), 1) + END DO + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_REZ = MAX( TMP_REZ, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) ':) .... OK ..........ZGEDMD PASSED.' + ELSE + NFAIL_REZ = NFAIL_REZ + 1 + WRITE(*,*) ':( ..................ZGEDMD FAILED!', & + 'Check the code for implementation errors.' 
+ WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + + IF ( LSAME(JOBREF,'E') ) THEN + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) ) + END DO + TMP_EX = MAX(TMP_EX,TMP) + END IF + + END IF + + DEALLOCATE(ZWORK) + DEALLOCATE(WORK) + DEALLOCATE(IWORK) + + IF ( TEST_QRDMD .AND. (K_TRAJ == 1) ) THEN + + ZF(1:M,1:N+1) = ZF0(1:M,1:N+1) + + CALL ZGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, & + WHTSVD, M, N+1, ZF, LDF, ZX, LDX, ZY, LDY, & + NRNK, TOL, K, ZEIGS, ZZ, LDZ, RES, ZAU, & + LDAU, ZW, LDW, ZS, LDS, ZDUMMY, -1, & + WDUMMY, -1, IDUMMY, -1, INFO ) + + LZWORK = INT(ZDUMMY(LWMINOPT)) + ALLOCATE( ZWORK(LZWORK) ) + LIWORK = IDUMMY(1) + ALLOCATE(IWORK(LIWORK)) + LWORK = INT(WDUMMY(1)) + ALLOCATE(WORK(LWORK)) + + CALL ZGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, & + WHTSVD, M, N+1, ZF, LDF, ZX, LDX, ZY, LDY, & + NRNK, TOL, KQ, ZEIGS, ZZ, LDZ, RES, ZAU, & + LDAU, ZW, LDW, ZS, LDS, ZWORK, LZWORK, & + WORK, LWORK, IWORK, LIWORK, INFO ) + + IF ( INFO /= 0 ) THEN + WRITE(*,*) 'Call to ZGEDMDQ failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, WANTQ, WANTR, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + STOP + END IF + SINGVQX(1:N) = WORK(1:N) + + !..... ZGEDMDQ check point + + IF ( 1 == 0 ) THEN + ! Comparison of ZGEDMD and ZGEDMDQ singular values disabled + TMP = ZERO + DO i = 1, MIN(K, KQ) + TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / & + SINGVX(1) ) + END DO + SVDIFF = MAX( SVDIFF, TMP ) + IF ( TMP > M*N*EPS ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_SVDIFF = NFAIL_SVDIFF + 1 + DO j =1, 3 + write(*,*) j, SINGVX(j), SINGVQX(j) + read(*,*) + END DO + + END IF + END IF + + !..... ZGEDMDQ check point + IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN + ! Check that the QR factors are computed and returned + ! as requested. The residual ||F-Q*R||_F / ||F||_F + ! is compared to M*N*EPS. + ZF1(1:M,1:N+1) = ZF0(1:M,1:N+1) + CALL ZGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -ZONE, ZF, & + LDF, ZY, LDY, ZONE, ZF1, LDF ) + TMP_FQR = ZLANGE( 'F', M, N+1, ZF1, LDF, WORK ) / & + ZLANGE( 'F', M, N+1, ZF0, LDF, WORK ) + IF ( TMP_FQR > TOL2 ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_F_QR = NFAIL_F_QR + 1 + ELSE + !WRITE(*,*) '........ PASSED.' + END IF + END IF + + !..... ZGEDMDQ check point + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by ZGEDMDQ with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL ZGEMM( 'N', 'N', M, KQ, M, ZONE, ZA, LDA, ZZ, LDZ, ZZERO, ZY1, LDY ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in ZGEDMDQ) + + DO i=1, KQ + ! have a real eigenvalue with real eigenvector + CALL ZAXPY( M, -ZEIGS(i), ZZ(1,i), 1, ZY1(1,i), 1 ) + ! Y(1:M,i) = Y(1:M,i) - REIG(i)*Z(1:M,i) + RES1(i) = DZNRM2( M, ZY1(1,i), 1) + END DO + TMP = ZERO + DO i = 1, KQ + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVQX(KQ)/(ANORM*SINGVQX(1)) ) + END DO + TMP_REZQ = MAX( TMP_REZQ, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) '.... OK ........ ZGEDMDQ PASSED.' + ELSE + NFAIL_REZQ = NFAIL_REZQ + 1 + WRITE(*,*) '................ ZGEDMDQ FAILED!', & + 'Check the code for implementation errors.' 
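+                     ! (RES returned by ZGEDMDQ disagreeing with the
+                     !  directly recomputed residuals RES1 by more than
+                     !  TOL2 points to an implementation error rather
+                     !  than to roundoff, so the driver stops here
+                     !  instead of only counting the failure.)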
+ STOP + END IF + + END IF + + DEALLOCATE( ZWORK ) + DEALLOCATE( WORK ) + DEALLOCATE( IWORK ) + + END IF ! ZGEDMDQ + +!....................................................................................................... + + END DO ! LWMINOPT + !write(*,*) 'LWMINOPT loop completed' + END DO ! iWHTSVD + !write(*,*) 'WHTSVD loop completed' + END DO ! iNRNK -2:-1 + !write(*,*) 'NRNK loop completed' + END DO ! iSCALE 1:4 + !write(*,*) 'SCALE loop completed' + END DO + !write(*,*) 'JOBREF loop completed' + END DO ! iJOBZ + !write(*,*) 'JOBZ loop completed' + + END DO ! MODE -6:6 + !write(*,*) 'MODE loop completed' + END DO ! 1 or 2 trajectories + !write(*,*) 'trajectories loop completed' + + DEALLOCATE( ZA ) + DEALLOCATE( ZAC ) + DEALLOCATE( ZZ ) + DEALLOCATE( ZF ) + DEALLOCATE( ZF0 ) + DEALLOCATE( ZF1 ) + DEALLOCATE( ZX ) + DEALLOCATE( ZX0 ) + DEALLOCATE( ZY ) + DEALLOCATE( ZY0 ) + DEALLOCATE( ZY1 ) + DEALLOCATE( ZAU ) + DEALLOCATE( ZW ) + DEALLOCATE( ZS ) + DEALLOCATE( ZZ1 ) + DEALLOCATE( RES ) + DEALLOCATE( RES1 ) + DEALLOCATE( RESEX ) + DEALLOCATE( ZEIGS ) + DEALLOCATE( SINGVX ) + DEALLOCATE( SINGVQX ) + + END DO ! LLOOP + + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for ZGEDMD :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + IF ( NFAIL_Z_XV == 0 ) THEN + WRITE(*,*) '>>>> Z - U*V test PASSED.' + ELSE + WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)' + WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_ZXW + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV + END IF + IF ( NFAIL_AU == 0 ) THEN + WRITE(*,*) '>>>> A*U test PASSED. ' + ELSE + WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)' + WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU + END IF + + IF ( NFAIL_REZ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' + ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ + END IF + + IF ( NFAIL_TOTAL == 0 ) THEN + WRITE(*,*) '>>>> ZGEDMD :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAIL_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>>>>>>>>> ZGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + IF ( TEST_QRDMD ) THEN + WRITE(*,*) + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for ZGEDMDQ :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + + IF ( NFAIL_SVDIFF == 0 ) THEN + WRITE(*,*) '>>>> ZGEDMD and ZGEDMDQ computed singular & + &values test PASSED.' + ELSE + WRITE(*,*) 'ZGEDMD and ZGEDMDQ discrepancies in & + &the singular values unacceptable ', & + NFAIL_SVDIFF, ' times. Test FAILED.' + WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF + END IF + + IF ( NFAIL_F_QR == 0 ) THEN + WRITE(*,*) '>>>> F - Q*R test PASSED.' + ELSE + WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)' + WRITE(*,*) 'The largest relative residual was ', TMP_FQR + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR + END IF + + IF ( NFAIL_REZQ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' 
+ ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZQ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ + END IF + + IF ( NFAILQ_TOTAL == 0 ) THEN + WRITE(*,*) '>>>>>>> ZGEDMDQ :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>> ZGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + END IF + + WRITE(*,*) + WRITE(*,*) 'Test completed.' + STOP + END diff --git a/lapack-netlib/TESTING/EIG/zerrst.f b/lapack-netlib/TESTING/EIG/zerrst.f index d7b41c053..31881c4de 100644 --- a/lapack-netlib/TESTING/EIG/zerrst.f +++ b/lapack-netlib/TESTING/EIG/zerrst.f @@ -748,17 +748,17 @@ CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 2*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) NT = NT + 12 @@ -830,19 +830,19 @@ INFOT = 18 CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 26*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) NT = NT + 13 diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index 676857a80..143fd0597 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -9,7 +9,7 @@ set(DZLNTST dlaord.f) set(SLINTST schkaa.F schkeq.f schkgb.f schkge.f schkgt.f schklq.f schkpb.f schkpo.f schkps.f schkpp.f - schkpt.f schkq3.f schkql.f schkqr.f schkrq.f + schkpt.f schkq3.f schkqp3rk.f schkql.f schkqr.f schkrq.f schksp.f schksy.f schksy_rook.f schksy_rk.f schksy_aa.f schksy_aa_2stage.f schktb.f schktp.f schktr.f @@ -56,7 +56,7 @@ set(CLINTST cchkaa.F cchkhe.f cchkhe_rook.f cchkhe_rk.f cchkhe_aa.f cchkhe_aa_2stage.f cchkhp.f cchklq.f cchkpb.f - cchkpo.f cchkps.f cchkpp.f cchkpt.f cchkq3.f cchkql.f + cchkpo.f cchkps.f cchkpp.f cchkpt.f cchkq3.f cchkqp3rk.f cchkql.f cchkqr.f cchkrq.f cchksp.f cchksy.f cchksy_rook.f cchksy_rk.f cchksy_aa.f cchksy_aa_2stage.f cchktb.f @@ -110,7 +110,7 @@ endif() set(DLINTST dchkaa.F dchkeq.f dchkgb.f dchkge.f dchkgt.f dchklq.f dchkpb.f dchkpo.f dchkps.f dchkpp.f - dchkpt.f dchkq3.f dchkql.f dchkqr.f dchkrq.f + dchkpt.f dchkq3.f dchkqp3rk.f dchkql.f dchkqr.f dchkrq.f dchksp.f dchksy.f dchksy_rook.f dchksy_rk.f 
dchksy_aa.f dchksy_aa_2stage.f dchktb.f dchktp.f dchktr.f @@ -158,7 +158,7 @@ set(ZLINTST zchkaa.F zchkhe.f zchkhe_rook.f zchkhe_rk.f zchkhe_aa.f zchkhe_aa_2stage.f zchkhp.f zchklq.f zchkpb.f - zchkpo.f zchkps.f zchkpp.f zchkpt.f zchkq3.f zchkql.f + zchkpo.f zchkps.f zchkpp.f zchkpt.f zchkq3.f zchkqp3rk.f zchkql.f zchkqr.f zchkrq.f zchksp.f zchksy.f zchksy_rook.f zchksy_rk.f zchksy_aa.f zchksy_aa_2stage.f zchktb.f diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile index 64abc4dba..714efa52a 100644 --- a/lapack-netlib/TESTING/LIN/Makefile +++ b/lapack-netlib/TESTING/LIN/Makefile @@ -45,7 +45,7 @@ DZLNTST = dlaord.o SLINTST = schkaa.o \ schkeq.o schkgb.o schkge.o schkgt.o \ schklq.o schkpb.o schkpo.o schkps.o schkpp.o \ - schkpt.o schkq3.o schkql.o schkqr.o schkrq.o \ + schkpt.o schkq3.o schkqp3rk.o schkql.o schkqr.o schkrq.o \ schksp.o schksy.o schksy_rook.o schksy_rk.o \ schksy_aa.o schksy_aa_2stage.o schktb.o schktp.o schktr.o \ schktz.o \ @@ -89,7 +89,7 @@ CLINTST = cchkaa.o \ cchkeq.o cchkgb.o cchkge.o cchkgt.o \ cchkhe.o cchkhe_rook.o cchkhe_rk.o \ cchkhe_aa.o cchkhe_aa_2stage.o cchkhp.o cchklq.o cchkpb.o \ - cchkpo.o cchkps.o cchkpp.o cchkpt.o cchkq3.o cchkql.o \ + cchkpo.o cchkps.o cchkpp.o cchkpt.o cchkq3.o cchkqp3rk.o cchkql.o \ cchkqr.o cchkrq.o cchksp.o cchksy.o cchksy_rook.o cchksy_rk.o \ cchksy_aa.o cchksy_aa_2stage.o cchktb.o \ cchktp.o cchktr.o cchktz.o \ @@ -137,7 +137,7 @@ endif DLINTST = dchkaa.o \ dchkeq.o dchkgb.o dchkge.o dchkgt.o \ dchklq.o dchkpb.o dchkpo.o dchkps.o dchkpp.o \ - dchkpt.o dchkq3.o dchkql.o dchkqr.o dchkrq.o \ + dchkpt.o dchkq3.o dchkqp3rk.o dchkql.o dchkqr.o dchkrq.o \ dchksp.o dchksy.o dchksy_rook.o dchksy_rk.o \ dchksy_aa.o dchksy_aa_2stage.o dchktb.o dchktp.o dchktr.o \ dchktz.o \ @@ -182,7 +182,7 @@ ZLINTST = zchkaa.o \ zchkeq.o zchkgb.o zchkge.o zchkgt.o \ zchkhe.o zchkhe_rook.o zchkhe_rk.o zchkhe_aa.o zchkhe_aa_2stage.o \ zchkhp.o zchklq.o zchkpb.o \ - zchkpo.o zchkps.o zchkpp.o zchkpt.o zchkq3.o zchkql.o \ + zchkpo.o zchkps.o zchkpp.o zchkpt.o zchkq3.o zchkqp3rk.o zchkql.o \ zchkqr.o zchkrq.o zchksp.o zchksy.o zchksy_rook.o zchksy_rk.o \ zchksy_aa.o zchksy_aa_2stage.o zchktb.o \ zchktp.o zchktr.o zchktz.o \ diff --git a/lapack-netlib/TESTING/LIN/alaerh.f b/lapack-netlib/TESTING/LIN/alaerh.f index 1845888a6..6c8a47f1e 100644 --- a/lapack-netlib/TESTING/LIN/alaerh.f +++ b/lapack-netlib/TESTING/LIN/alaerh.f @@ -797,6 +797,18 @@ WRITE( NOUT, FMT = 9978 ) $ SUBNAM(1:LEN_TRIM( SUBNAM )), INFO, M, N, IMAT END IF +* + ELSE IF( LSAMEN( 2, P2, 'QK' ) ) THEN +* +* xQK: truncated QR factorization with pivoting +* + IF( LSAMEN( 7, SUBNAM( 2: 8 ), 'GEQP3RK' ) ) THEN + WRITE( NOUT, FMT = 9930 ) + $ SUBNAM(1:LEN_TRIM( SUBNAM )), INFO, M, N, KL, N5, IMAT + ELSE IF( LSAMEN( 5, SUBNAM( 2: 6 ), 'LATMS' ) ) THEN + WRITE( NOUT, FMT = 9978 ) + $ SUBNAM(1:LEN_TRIM( SUBNAM )), INFO, M, N, IMAT + END IF * ELSE IF( LSAMEN( 2, P2, 'LQ' ) ) THEN * @@ -1147,6 +1159,11 @@ * What we do next * 9949 FORMAT( ' ==> Doing only the condition estimate for this case' ) +* +* SUBNAM, INFO, M, N, NB, IMAT +* + 9930 FORMAT( ' *** Error code from ', A, '=', I5, / ' ==> M =', I5, + $ ', N =', I5, ', NX =', I5, ', NB =', I4, ', type ', I2 ) * RETURN * diff --git a/lapack-netlib/TESTING/LIN/alahd.f b/lapack-netlib/TESTING/LIN/alahd.f index dd75394b3..8f966c584 100644 --- a/lapack-netlib/TESTING/LIN/alahd.f +++ b/lapack-netlib/TESTING/LIN/alahd.f @@ -584,13 +584,27 @@ * * QR decomposition with column pivoting * - WRITE( IOUNIT, FMT = 9986 )PATH + WRITE( 
IOUNIT, FMT = 8006 )PATH WRITE( IOUNIT, FMT = 9969 ) WRITE( IOUNIT, FMT = '( '' Test ratios:'' )' ) WRITE( IOUNIT, FMT = 9940 )1 WRITE( IOUNIT, FMT = 9939 )2 WRITE( IOUNIT, FMT = 9938 )3 WRITE( IOUNIT, FMT = '( '' Messages:'' )' ) +* + ELSE IF( LSAMEN( 2, P2, 'QK' ) ) THEN +* +* truncated QR decomposition with column pivoting +* + WRITE( IOUNIT, FMT = 8006 )PATH + WRITE( IOUNIT, FMT = 9871 ) + WRITE( IOUNIT, FMT = '( '' Test ratios:'' )' ) + WRITE( IOUNIT, FMT = 8060 )1 + WRITE( IOUNIT, FMT = 8061 )2 + WRITE( IOUNIT, FMT = 8062 )3 + WRITE( IOUNIT, FMT = 8063 )4 + WRITE( IOUNIT, FMT = 8064 )5 + WRITE( IOUNIT, FMT = '( '' Messages:'' )' ) * ELSE IF( LSAMEN( 2, P2, 'TZ' ) ) THEN * @@ -779,6 +793,8 @@ $ 'tall-skinny or short-wide matrices' ) 8005 FORMAT( / 1X, A3, ': Householder reconstruction from TSQR', $ ' factorization output ', /,' for tall-skinny matrices.' ) + 8006 FORMAT( / 1X, A3, ': truncated QR factorization', + $ ' with column pivoting' ) * * GE matrix types * @@ -922,6 +938,36 @@ $ / 4X, '3. Geometric distribution', 10X, $ '6. Every second column fixed' ) * +* QK matrix types +* + 9871 FORMAT( 4X, ' 1. Zero matrix', / + $ 4X, ' 2. Random, Diagonal, CNDNUM = 2', / + $ 4X, ' 3. Random, Upper triangular, CNDNUM = 2', / + $ 4X, ' 4. Random, Lower triangular, CNDNUM = 2', / + $ 4X, ' 5. Random, First column is zero, CNDNUM = 2', / + $ 4X, ' 6. Random, Last MINMN column is zero, CNDNUM = 2', / + $ 4X, ' 7. Random, Last N column is zero, CNDNUM = 2', / + $ 4X, ' 8. Random, Middle column in MINMN is zero,', + $ ' CNDNUM = 2', / + $ 4X, ' 9. Random, First half of MINMN columns are zero,', + $ ' CNDNUM = 2', / + $ 4X, '10. Random, Last columns are zero starting from', + $ ' MINMN/2+1, CNDNUM = 2', / + $ 4X, '11. Random, Half MINMN columns in the middle are', + $ ' zero starting from MINMN/2-(MINMN/2)/2+1,' + $ ' CNDNUM = 2', / + $ 4X, '12. Random, Odd columns are ZERO, CNDNUM = 2', / + $ 4X, '13. Random, Even columns are ZERO, CNDNUM = 2', / + $ 4X, '14. Random, CNDNUM = 2', / + $ 4X, '15. Random, CNDNUM = sqrt(0.1/EPS)', / + $ 4X, '16. Random, CNDNUM = 0.1/EPS', / + $ 4X, '17. Random, CNDNUM = 0.1/EPS,', + $ ' one small singular value S(N)=1/CNDNUM', / + $ 4X, '18. Random, CNDNUM = 2, scaled near underflow,', + $ ' NORM = SMALL = SAFMIN', / + $ 4X, '19. 
Random, CNDNUM = 2, scaled near overflow,', + $ ' NORM = LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) )' ) +* * TZ matrix types * 9968 FORMAT( ' Matrix types (2-3 have condition 1/EPS):', / 4X, @@ -1030,9 +1076,8 @@ $ ' * norm(C) * EPS )' ) 9940 FORMAT( 3X, I2, ': norm(svd(A) - svd(R)) / ', $ '( M * norm(svd(R)) * EPS )' ) - 9939 FORMAT( 3X, I2, ': norm( A*P - Q*R ) / ( M * norm(A) * EPS )' - $ ) - 9938 FORMAT( 3X, I2, ': norm( I - Q''*Q ) / ( M * EPS )' ) + 9939 FORMAT( 3X, I2, ': norm( A*P - Q*R ) / ( M * norm(A) * EPS )') + 9938 FORMAT( 3X, I2, ': norm( I - Q''*Q ) / ( M * EPS )' ) 9937 FORMAT( 3X, I2, ': norm( A - R*Q ) / ( M * norm(A) * EPS )' $ ) 9935 FORMAT( 3X, I2, ': norm( B - A * X ) / ', @@ -1105,6 +1150,15 @@ 8054 FORMAT(3X,I2,': norm( C*Q - C*Q ) / ( M * norm(C) * EPS )' ) 8055 FORMAT(3X,I2,': norm( C*Q'' - C*Q'' ) / ( M * norm(C) * EPS )') + 8060 FORMAT( 3X, I2, ': 2-norm(svd(A) - svd(R)) / ', + $ '( max(M,N) * 2-norm(svd(R)) * EPS )' ) + 8061 FORMAT( 3X, I2, ': 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A)', + $ ' * EPS )') + 8062 FORMAT( 3X, I2, ': 1-norm( I - Q''*Q ) / ( M * EPS )' ) + 8063 FORMAT( 3X, I2, ': Returns 1.0D+100, if abs(R(K+1,K+1))', + $ ' > abs(R(K,K)), where K=1:KFACT-1' ) + 8064 FORMAT( 3X, I2, ': 1-norm(Q**T * B - Q**T * B ) / ( M * EPS )') + * RETURN * diff --git a/lapack-netlib/TESTING/LIN/alareq.f b/lapack-netlib/TESTING/LIN/alareq.f index db18775eb..3f057fa48 100644 --- a/lapack-netlib/TESTING/LIN/alareq.f +++ b/lapack-netlib/TESTING/LIN/alareq.f @@ -28,12 +28,12 @@ *> to evaluate the input line which requested NMATS matrix types for *> PATH. The flow of control is as follows: *> -*> If NMATS = NTYPES then +*> IF NMATS = NTYPES THEN *> DOTYPE(1:NTYPES) = .TRUE. -*> else +*> ELSE *> Read the next input line for NMATS matrix types *> Set DOTYPE(I) = .TRUE. for each valid type I -*> endif +*> END IF *> \endverbatim * * Arguments: diff --git a/lapack-netlib/TESTING/LIN/cchkaa.F b/lapack-netlib/TESTING/LIN/cchkaa.F index ec1534ed4..57d95c741 100644 --- a/lapack-netlib/TESTING/LIN/cchkaa.F +++ b/lapack-netlib/TESTING/LIN/cchkaa.F @@ -69,6 +69,7 @@ *> CLQ 8 List types on next line if 0 < NTYPES < 8 *> CQL 8 List types on next line if 0 < NTYPES < 8 *> CQP 6 List types on next line if 0 < NTYPES < 6 +*> ZQK 19 List types on next line if 0 < NTYPES < 19 *> CTZ 3 List types on next line if 0 < NTYPES < 3 *> CLS 6 List types on next line if 0 < NTYPES < 6 *> CEQ @@ -153,12 +154,11 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - REAL S( 2*NMAX ) - COMPLEX E( NMAX ) * .. * .. Allocatable Arrays .. INTEGER AllocateStatus - REAL, DIMENSION(:), ALLOCATABLE :: RWORK + REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S + COMPLEX, DIMENSION(:), ALLOCATABLE :: E COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK * .. * .. External Functions .. 
@@ -170,14 +170,14 @@ EXTERNAL ALAREQ, CCHKEQ, CCHKGB, CCHKGE, CCHKGT, CCHKHE, $ CCHKHE_ROOK, CCHKHE_RK, CCHKHE_AA, CCHKHP, $ CCHKLQ, CCHKUNHR_COL, CCHKPB, CCHKPO, CCHKPS, - $ CCHKPP, CCHKPT, CCHKQ3, CCHKQL, CCHKQR, CCHKRQ, - $ CCHKSP, CCHKSY, CCHKSY_ROOK, CCHKSY_RK, - $ CCHKSY_AA, CCHKTB, CCHKTP, CCHKTR, CCHKTZ, - $ CDRVGB, CDRVGE, CDRVGT, CDRVHE, CDRVHE_ROOK, - $ CDRVHE_RK, CDRVHE_AA, CDRVHP, CDRVLS, CDRVPB, - $ CDRVPO, CDRVPP, CDRVPT, CDRVSP, CDRVSY, - $ CDRVSY_ROOK, CDRVSY_RK, CDRVSY_AA, ILAVER, - $ CCHKQRT, CCHKQRTP + $ CCHKPP, CCHKPT, CCHKQ3, CCHKQP3RK, CCHKQL, + $ CCHKQR, CCHKRQ, CCHKSP, CCHKSY, CCHKSY_ROOK, + $ CCHKSY_RK, CCHKSY_AA, CCHKTB, CCHKTP, CCHKTR, + $ CCHKTZ, CDRVGB, CDRVGE, CDRVGT, CDRVHE, + $ CDRVHE_ROOK, CDRVHE_RK, CDRVHE_AA, CDRVHP, + $ CDRVLS, CDRVPB, CDRVPO, CDRVPP, CDRVPT, CDRVSP, + $ CDRVSY, CDRVSY_ROOK, CDRVSY_RK, CDRVSY_AA, + $ ILAVER, CCHKQRT, CCHKQRTP * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -203,6 +203,10 @@ IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" ALLOCATE ( WORK( NMAX, NMAX+MAXRHS+10 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( E( NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" ALLOCATE ( RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. @@ -1109,6 +1113,23 @@ ELSE WRITE( NOUT, FMT = 9989 )PATH END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* QK: truncated QR factorization with pivoting +* + NTYPES = 19 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ), + $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ S( 1 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF * ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN * @@ -1211,6 +1232,8 @@ * DEALLOCATE (A, STAT = AllocateStatus) DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (E, STAT = AllocateStatus) + DEALLOCATE (S, STAT = AllocateStatus) DEALLOCATE (WORK, STAT = AllocateStatus) DEALLOCATE (RWORK, STAT = AllocateStatus) * diff --git a/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f index 30a61261f..d79978e55 100644 --- a/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f @@ -433,9 +433,9 @@ * block factorization, LWORK is the length of AINV. 
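+*          (The MAX( 1, ... ) guards added below keep the workspace
+*          sizes positive when N or NB is zero; LAPACK requires
+*          LWORK >= 1.)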
* SRNAMT = 'CHETRF_AA_2STAGE' - LWORK = MIN(N*NB, 3*NMAX*NMAX) - CALL CHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, - $ AINV, (3*NB+1)*N, + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX) + CALL CHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ WORK, LWORK, $ INFO ) @@ -517,7 +517,6 @@ c NT = 1 CALL CLACPY( 'Full', N, NRHS, B, LDA, X, LDA ) * SRNAMT = 'CHETRS_AA_2STAGE' - LWORK = MAX( 1, 3*N-2 ) CALL CHETRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, $ AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ), $ X, LDA, INFO ) diff --git a/lapack-netlib/TESTING/LIN/cchkqp3rk.f b/lapack-netlib/TESTING/LIN/cchkqp3rk.f new file mode 100644 index 000000000..79d6add72 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/cchkqp3rk.f @@ -0,0 +1,836 @@ +*> \brief \b CCHKQP3RK +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, +* $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, +* $ B, COPYB, S, TAU, +* $ WORK, RWORK, IWORK, NOUT ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER NM, NN, NNB, NOUT +* REAL THRESH +* .. +* .. Array Arguments .. +* LOGICAL DOTYPE( * ) +* INTEGER IWORK( * ), MVAL( * ), NBVAL( * ), NVAL( * ), +* $ NXVAL( * ) +* REAL S( * ), RWORK( * ) +* COMPLEX A( * ), COPYA( * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CCHKQP3RK tests CGEQP3RK. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] DOTYPE +*> \verbatim +*> DOTYPE is LOGICAL array, dimension (NTYPES) +*> The matrix types to be used for testing. Matrices of type j +*> (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) = +*> .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used. +*> \endverbatim +*> +*> \param[in] NM +*> \verbatim +*> NM is INTEGER +*> The number of values of M contained in the vector MVAL. +*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. +*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNS +*> \verbatim +*> NNS is INTEGER +*> The number of values of NRHS contained in the vector NSVAL. +*> \endverbatim +*> +*> \param[in] NSVAL +*> \verbatim +*> NSVAL is INTEGER array, dimension (NNS) +*> The values of the number of right hand sides NRHS. +*> \endverbatim +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB and NX contained in the +*> vectors NBVAL and NXVAL. The blocking parameters are used +*> in pairs (NB,NX). +*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NNB) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NXVAL +*> \verbatim +*> NXVAL is INTEGER array, dimension (NNB) +*> The values of the crossover point NX. +*> \endverbatim +*> +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. 
+*> \endverbatim +*> +*> \param[out] A +*> \verbatim +*> A is COMPLEX array, dimension (MMAX*NMAX) +*> where MMAX is the maximum value of M in MVAL and NMAX is the +*> maximum value of N in NVAL. +*> \endverbatim +*> +*> \param[out] COPYA +*> \verbatim +*> COPYA is COMPLEX array, dimension (MMAX*NMAX) +*> \endverbatim +*> +*> \param[out] B +*> \verbatim +*> B is COMPLEX array, dimension (MMAX*NSMAX) +*> where MMAX is the maximum value of M in MVAL and NSMAX is the +*> maximum value of NRHS in NSVAL. +*> \endverbatim +*> +*> \param[out] COPYB +*> \verbatim +*> COPYB is COMPLEX array, dimension (MMAX*NSMAX) +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is REAL array, dimension +*> (min(MMAX,NMAX)) +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX array, dimension (MMAX) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension +*> (max(M*max(M,N) + 4*min(M,N) + max(M,N))) +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (4*NMAX) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (2*NMAX) +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex_lin +* +* ===================================================================== + SUBROUTINE CCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, + $ B, COPYB, S, TAU, + $ WORK, RWORK, IWORK, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER NM, NN, NNB, NNS, NOUT + REAL THRESH +* .. +* .. Array Arguments .. + LOGICAL DOTYPE( * ) + INTEGER IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ), + $ NSVAL( * ), NXVAL( * ) + REAL S( * ), RWORK( * ) + COMPLEX A( * ), COPYA( * ), B( * ), COPYB( * ), + $ TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NTYPES + PARAMETER ( NTYPES = 19 ) + INTEGER NTESTS + PARAMETER ( NTESTS = 5 ) + REAL ONE, ZERO, BIGNUM + COMPLEX CONE, CZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0, + $ CZERO = ( 0.0E+0, 0.0E+0 ), + $ CONE = ( 1.0E+0, 0.0E+0 ), + $ BIGNUM = 1.0E+38 ) +* .. +* .. Local Scalars .. + CHARACTER DIST, TYPE + CHARACTER*3 PATH + INTEGER I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO, + $ INB, IND_OFFSET_GEN, + $ IND_IN, IND_OUT, INS, INFO, + $ ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO, + $ KFACT, KL, KMAX, KU, LDA, LW, LWORK, + $ LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N, + $ NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS, + $ NRUN, NX, T + REAL ANORM, CNDNUM, EPS, ABSTOL, RELTOL, + $ DTEMP, MAXC2NRMK, RELMAXC2NRMK +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ), ISEEDY( 4 ) + REAL RESULT( NTESTS ), RDUMMY( 1 ) +* .. +* .. External Functions .. + REAL SLAMCH, CQPT01, CQRT11, CQRT12, CLANGE + EXTERNAL SLAMCH, CQPT01, CQRT11, CQRT12, CLANGE +* .. +* .. External Subroutines .. + EXTERNAL ALAERH, ALAHD, ALASUM, SLAORD, ICOPY, CAXPY, + $ XLAENV, CGEQP3RK, CLACPY, CLASET, CLATB4, + $ CLATMS, CUNMQR, CSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, MOD, REAL +* .. +* .. Scalars in Common .. 
+ LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, IOUNIT, CUNMQR_LWORK +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, IOUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEEDY / 1988, 1989, 1990, 1991 / +* .. +* .. Executable Statements .. +* +* Initialize constants and the random number seed. +* + PATH( 1: 1 ) = 'Complex precision' + PATH( 2: 3 ) = 'QK' + NRUN = 0 + NFAIL = 0 + NERRS = 0 + DO I = 1, 4 + ISEED( I ) = ISEEDY( I ) + END DO + EPS = SLAMCH( 'Epsilon' ) + INFOT = 0 +* + DO IM = 1, NM +* +* Do for each value of M in MVAL. +* + M = MVAL( IM ) + LDA = MAX( 1, M ) +* + DO IN = 1, NN +* +* Do for each value of N in NVAL. +* + N = NVAL( IN ) + MINMN = MIN( M, N ) + LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ), + $ M*N + 2*MINMN + 4*N ) +* + DO INS = 1, NNS + NRHS = NSVAL( INS ) +* +* Set up parameters with CLATB4 and generate +* M-by-NRHS B matrix with CLATMS. +* IMAT = 14: +* Random matrix, CNDNUM = 2, NORM = ONE, +* MODE = 3 (geometric distribution of singular values). +* + CALL CLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'CLATMS' + CALL CLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYB, LDA, WORK, INFO ) +* +* Check error code from CLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'CLATMS', INFO, 0, ' ', M, + $ NRHS, -1, -1, -1, 6, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + DO IMAT = 1, NTYPES +* +* Do the tests only if DOTYPE( IMAT ) is true. +* + IF( .NOT.DOTYPE( IMAT ) ) + $ CYCLE +* +* The type of distribution used to generate the random +* eigen-/singular values: +* ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 ) +* +* Do for each type of NON-SYMMETRIC matrix: CNDNUM NORM MODE +* 1. Zero matrix +* 2. Random, Diagonal, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 3. Random, Upper triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 4. Random, Lower triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 5. Random, First column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 6. Random, Last MINMN column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 7. Random, Last N column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 8. Random, Middle column in MINMN is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 9. Random, First half of MINMN columns are zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 11. Random, Half MINMN columns in the middle are zero starting +* from MINMN/2-(MINMN/2)/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 12. Random, Odd columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 13. Random, Even columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 14. Random, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 15. Random, CNDNUM = sqrt(0.1/EPS) CNDNUM = BADC1 = sqrt(0.1/EPS) ONE 3 ( geometric distribution of singular values ) +* 16. Random, CNDNUM = 0.1/EPS CNDNUM = BADC2 = 0.1/EPS ONE 3 ( geometric distribution of singular values ) +* 17. 
Random, CNDNUM = 0.1/EPS, CNDNUM = BADC2 = 0.1/EPS ONE 2 ( one small singular value, S(N)=1/CNDNUM )
+*       one small singular value S(N)=1/CNDNUM
+*   18. Random, CNDNUM = 2, scaled near underflow CNDNUM = 2 SMALL = SAFMIN
+*   19. Random, CNDNUM = 2, scaled near overflow CNDNUM = 2 LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) ) 3 ( geometric distribution of singular values )
+*
+               IF( IMAT.EQ.1 ) THEN
+*
+*                 Matrix 1: Zero matrix
+*
+                  CALL CLASET( 'Full', M, N, CZERO, CZERO, COPYA, LDA )
+                  DO I = 1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 )
+     $                 .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN
+*
+*                 Matrices 2-4 and 14-19.
+*
+*                 Set up parameters with CLATB4 and generate a test
+*                 matrix with CLATMS.
+*
+                  CALL CLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'CLATMS'
+                  CALL CLATMS( M, N, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA, LDA, WORK, INFO )
+*
+*                 Check error code from CLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'CLATMS', INFO, 0, ' ', M, N,
+     $                            -1, -1, -1, IMAT, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+                  CALL SLAORD( 'Decreasing', MINMN, S, 1 )
+*
+               ELSE IF( MINMN.GE.2
+     $                 .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN
+*
+*                 Rectangular matrices 5-13 that contain zero columns;
+*                 used only when MINMN >= 2.
+*
+*                 JB_ZERO is the column index of the ZERO block.
+*                 NB_ZERO is the column block size of the ZERO block.
+*                 NB_GEN is the column block size of the
+*                 generated block.
+*                 J_INC is the non-zero column index increment
+*                 for matrices 12 and 13.
+*                 J_FIRST_NZ is the index of the first non-zero
+*                 column.
+*
+                  IF( IMAT.EQ.5 ) THEN
+*
+*                    First column is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.6 ) THEN
+*
+*                    Last column MINMN is zero.
+*
+                     JB_ZERO = MINMN
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.7 ) THEN
+*
+*                    Last column N is zero.
+*
+                     JB_ZERO = N
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.8 ) THEN
+*
+*                    Middle column in MINMN is zero.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.9 ) THEN
+*
+*                    First half of MINMN columns is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.10 ) THEN
+*
+*                    Last columns are zero columns,
+*                    starting from (MINMN / 2 + 1) column.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = N - JB_ZERO + 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.11 ) THEN
+*
+*                    Half of the columns in the middle of MINMN
+*                    columns are zero, starting from
+*                    MINMN/2 - (MINMN/2)/2 + 1 column.
+*
+                     JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.12 ) THEN
+*
+*                    Odd-numbered columns are zero.
+*
+                     NB_GEN = N / 2
+                     NB_ZERO = N - NB_GEN
+                     J_INC = 2
+                     J_FIRST_NZ = 2
+*
+                  ELSE IF( IMAT.EQ.13 ) THEN
+*
+*                    Even-numbered columns are zero.
+*
+                     NB_ZERO = N / 2
+                     NB_GEN = N - NB_ZERO
+                     J_INC = 2
+                     J_FIRST_NZ = 1
+*
+                  END IF
+*
+*
+*                 1) Set the first NB_ZERO columns in COPYA(1:M,1:N)
+*                    to zero.
+*
+                  CALL CLASET( 'Full', M, NB_ZERO, CZERO, CZERO,
+     $                         COPYA, LDA )
+*
+*                 2) Generate an M-by-(N-NB_ZERO) matrix with the
+*                    chosen singular value distribution
+*                    in COPYA(1:M,NB_ZERO+1:N).
+*
+                  CALL CLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU,
+     $                         ANORM, MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'CLATMS'
+*
+                  IND_OFFSET_GEN = NB_ZERO * LDA
+*
+                  CALL CLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA( IND_OFFSET_GEN + 1 ), LDA,
+     $                         WORK, INFO )
+*
+*                 Check error code from CLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'CLATMS', INFO, 0, ' ', M,
+     $                            NB_GEN, -1, -1, -1, IMAT, NFAIL,
+     $                            NERRS, NOUT )
+                     CYCLE
+                  END IF
+*
+*                 3) Swap the generated columns from the right-side
+*                    NB_GEN-size block in COPYA into the correct column
+*                    positions.
+*
+                  IF( IMAT.EQ.6
+     $                .OR. IMAT.EQ.7
+     $                .OR. IMAT.EQ.8
+     $                .OR. IMAT.EQ.10
+     $                .OR. IMAT.EQ.11 ) THEN
+*
+*                    Move by swapping the generated columns
+*                    from the right NB_GEN-size block from
+*                    (NB_ZERO+1:NB_ZERO+JB_ZERO)
+*                    into columns (1:JB_ZERO-1).
+*
+                     DO J = 1, JB_ZERO-1, 1
+                        CALL CSWAP( M,
+     $                       COPYA( ( NB_ZERO+J-1)*LDA+1), 1,
+     $                       COPYA( (J-1)*LDA + 1 ), 1 )
+                     END DO
+*
+                  ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN
+*
+*                    ( IMAT = 12, Odd-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the even zero columns in the
+*                    left NB_ZERO-size block.
+*
+*                    ( IMAT = 13, Even-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the odd zero columns in the
+*                    left NB_ZERO-size block.
+*
+                     DO J = 1, NB_GEN, 1
+                        IND_OUT = ( NB_ZERO+J-1 )*LDA + 1
+                        IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA
+     $                           + 1
+                        CALL CSWAP( M,
+     $                       COPYA( IND_OUT ), 1,
+     $                       COPYA( IND_IN), 1 )
+                     END DO
+*
+                  END IF
+*
+*                 4) Order the singular values generated by
+*                    CLATMS in decreasing order and add trailing zeros
+*                    that correspond to zero columns.
+*                    The total number of singular values is MINMN.
+*
+                  MINMNB_GEN = MIN( M, NB_GEN )
+*
+                  CALL SLAORD( 'Decreasing', MINMNB_GEN, S, 1 )
+
+                  DO I = MINMNB_GEN+1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE
+*
+*                 IF(MINMN.LT.2) skip this size for this matrix type.
+*
+                  CYCLE
+               END IF
+*
+*              Initialize a copy of the pivot array for CGEQP3RK.
+*
+               DO I = 1, N
+                  IWORK( I ) = 0
+               END DO
+*
+               DO INB = 1, NNB
+*
+*                 Do for each pair of values (NB,NX) in NBVAL and NXVAL.
+*
+                  NB = NBVAL( INB )
+                  CALL XLAENV( 1, NB )
+                  NX = NXVAL( INB )
+                  CALL XLAENV( 3, NX )
+*
+*                 We loop up to MIN(M,N)+1 in order to test KMAX greater
+*                 than MIN(M,N); in that case the routine should reduce
+*                 KMAX to MIN(M,N).
+*
+                  DO KMAX = 0, MIN(M,N)+1
+*
+*                    Get a working copy of COPYA into A( 1:M, 1:N ).
+*                    Get a working copy of COPYB into A( 1:M, N+1:N+NRHS ).
+*                    Get a working copy of COPYB into B( 1:M, 1:NRHS ).
+*                    Copy the zero-initialized IWORK( 1:N ) into
+*                    IWORK( N+1:2N ), which is used as the pivot array.
+*                    NOTE: IWORK( 2N+1:3N ) is used as a WORK array
+*                    for the routine.
+*
+                     CALL CLACPY( 'All', M, N, COPYA, LDA, A, LDA )
+                     CALL CLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                            A( LDA*N + 1 ), LDA )
+                     CALL CLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                            B, LDA )
+                     CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 )
+*
+                     ABSTOL = -1.0
+                     RELTOL = -1.0
+*
+*                    Compute the QR factorization with pivoting of A
+*
+                     LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ),
+     $                                 3*N + NRHS - 1 ) )
+*
+*                    Compute CGEQP3RK factorization of A.
+*
+                     SRNAMT = 'CGEQP3RK'
+                     CALL CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                              A, LDA, KFACT, MAXC2NRMK,
+     $                              RELMAXC2NRMK, IWORK( N+1 ), TAU,
+     $                              WORK, LW, RWORK, IWORK( 2*N+1 ),
+     $                              INFO )
+*
+*                    Check error code from CGEQP3RK.
+*
+                     IF( INFO.LT.0 )
+     $                  CALL ALAERH( PATH, 'CGEQP3RK', INFO, 0, ' ',
+     $                               M, N, NX, -1, NB, IMAT,
+     $                               NFAIL, NERRS, NOUT )
+*
+                     IF( KFACT.EQ.MINMN ) THEN
+*
+*                       Compute test 1:
+*
+*                       This test is only for the full-rank factorization of
+*                       the matrix A.
+*
+*                       Array S(1:min(M,N)) contains svd(A), the singular values
+*                       of the original matrix A in decreasing absolute value
+*                       order. The test computes svd(R), the vector of singular
+*                       values of the upper trapezoid of A(1:M,1:N) that
+*                       contains the factor R, in decreasing order. The test
+*                       returns the ratio:
+*
+*                       2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS )
+*
+                        RESULT( 1 ) = CQRT12( M, N, A, LDA, S, WORK,
+     $                                LWORK, RWORK )
+*
+                        DO T = 1, 1
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', M, N,
+     $                            NRHS, KMAX, ABSTOL, RELTOL, NB, NX,
+     $                            IMAT, T, RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                       End test 1
+*
+                     END IF
+
+*                    Compute test 2:
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS )
+*
+                     RESULT( 2 ) = CQPT01( M, N, KFACT, COPYA, A, LDA, TAU,
+     $                             IWORK( N+1 ), WORK, LWORK )
+*
+*                    Compute test 3:
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( Q**T * Q - I ) / ( M * EPS )
+*
+                     RESULT( 3 ) = CQRT11( M, KFACT, A, LDA, TAU, WORK,
+     $                             LWORK )
+*
+*                    Print information about the tests that did not pass
+*                    the threshold.
+*
+                     DO T = 2, 3
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', M, N,
+     $                         NRHS, KMAX, ABSTOL, RELTOL,
+     $                         NB, NX, IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 2
+*
+*                    Compute test 4:
+*
+*                    This test is only for factorizations with
+*                    rank at least 2.
+*                    The elements on the diagonal of R should be
+*                    non-increasing.
+*
+*                    The test returns the ratio:
+*
+*                    Returns BIGNUM if abs(R(K+1,K+1)) > abs(R(K,K)),
+*                    K=1:KFACT-1
+*
+                     IF( MIN(KFACT, MINMN).GE.2 ) THEN
+*
+                        RESULT( 4 ) = ZERO
+*
+                        DO J = 1, KFACT-1, 1
+*
+                           DTEMP = (( ABS( A( (J-1)*M+J ) ) -
+     $                          ABS( A( (J)*M+J+1 ) ) ) /
+     $                          ABS( A(1) ) )
+*
+                           IF( DTEMP.LT.ZERO ) THEN
+                              RESULT( 4 ) = BIGNUM
+                           END IF
+*
+                        END DO
+*
+*                       Print information about the tests that did not
+*                       pass the threshold.
+*
+                        DO T = 4, 4
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK',
+     $                            M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T,
+     $                            RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                       End test 4.
+*
+                     END IF
+*
+*                    Compute test 5:
+*
+*                    This test is only for matrix A with min(M,N) > 0.
+*
+*                    The test returns the ratio:
+*
+*                    1-norm(Q**T * B - Q**T * B ) /
+*                        ( M * EPS )
+*
+*                    (1) Compute B:=Q**T * B in the matrix B.
+*
+                     IF( MINMN.GT.0 ) THEN
+*
+                        LWORK_MQR = MAX(1, NRHS)
+                        CALL CUNMQR( 'Left', 'Conjugate transpose',
+     $                       M, NRHS, KFACT, A, LDA, TAU, B, LDA,
+     $                       WORK, LWORK_MQR, INFO )
+*
+                        DO I = 1, NRHS
+*
+*                          Compare the (N+I)-th column of A and the
+*                          I-th column of B.
+*
+                           CALL CAXPY( M, -CONE, A( ( N+I-1 )*LDA+1 ), 1,
+     $                                 B( ( I-1 )*LDA+1 ), 1 )
+                        END DO
+*
+                        RESULT( 5 ) =
+     $                    ABS(
+     $                    CLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) /
+     $                    ( REAL( M )*SLAMCH( 'Epsilon' ) )
+     $                    )
+*
+*                       Print information about the tests that did not pass
+*                       the threshold.
+*
+                        DO T = 5, 5
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', M, N,
+     $                            NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T, RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                       End compute test 5.
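+*
+*                       Recap of the five CGEQP3RK test ratios above:
+*                        1: 2-norm(svd(R) - svd(A)) /
+*                           ( max(M,N) * 2-norm(svd(A)) * EPS ),
+*                           full-rank factorizations only
+*                        2: 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS )
+*                        3: 1-norm( Q**T * Q - I ) / ( M * EPS )
+*                        4: BIGNUM, if abs(R(K+1,K+1)) > abs(R(K,K))
+*                           for some K = 1:KFACT-1
+*                        5: 1-norm( Q**T*B - Q**T*B ) / ( M * EPS ),
+*                           comparing the CUNMQR product with the copy
+*                           returned by CGEQP3RK
+*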
+* + END IF +* +* END DO KMAX = 1, MIN(M,N)+1 +* + END DO +* +* END DO for INB = 1, NNB +* + END DO +* +* END DO for IMAT = 1, NTYPES +* + END DO +* +* END DO for INS = 1, NNS +* + END DO +* +* END DO for IN = 1, NN +* + END DO +* +* END DO for IM = 1, NM +* + END DO +* +* Print a summary of the results. +* + CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) +* + 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5, + $ ', KMAX =', I5, ', ABSTOL =', G12.5, + $ ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4, + $ ', type ', I2, ', test ', I2, ', ratio =', G12.5 ) +* +* End of CCHKQP3RK +* + END diff --git a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f index 51cef512d..83e8a17b0 100644 --- a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f @@ -400,9 +400,9 @@ * Factor the matrix and solve the system using CHESV_AA. * SRNAMT = 'CHESV_AA_2STAGE ' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX) CALL CHESV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ X, LDA, WORK, LWORK, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/clatb4.f b/lapack-netlib/TESTING/LIN/clatb4.f index eeb0f03a9..233a8631a 100644 --- a/lapack-netlib/TESTING/LIN/clatb4.f +++ b/lapack-netlib/TESTING/LIN/clatb4.f @@ -154,9 +154,6 @@ * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. -* .. External Subroutines .. - EXTERNAL SLABAD -* .. * .. Save statement .. SAVE EPS, SMALL, LARGE, BADC1, BADC2, FIRST * .. @@ -174,11 +171,6 @@ BADC1 = SQRT( BADC2 ) SMALL = SLAMCH( 'Safe minimum' ) LARGE = ONE / SMALL -* -* If it looks like we're on a Cray, take the square root of -* SMALL and LARGE to avoid overflow and underflow problems. -* - CALL SLABAD( SMALL, LARGE ) SMALL = SHRINK*( SMALL / EPS ) LARGE = ONE / SMALL END IF @@ -233,6 +225,110 @@ ELSE ANORM = ONE END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* xQK: truncated QR with pivoting. +* Set parameters to generate a general +* M x N matrix. +* +* Set TYPE, the type of matrix to be generated. 'N' is nonsymmetric. +* + TYPE = 'N' +* +* Set DIST, the type of distribution for the random +* number generator. 'S' is +* + DIST = 'S' +* +* Set the lower and upper bandwidths. +* + IF( IMAT.EQ.2 ) THEN +* +* 2. Random, Diagonal, CNDNUM = 2 +* + KL = 0 + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.3 ) THEN +* +* 3. Random, Upper triangular, CNDNUM = 2 +* + KL = 0 + KU = MAX( N-1, 0 ) + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.4 ) THEN +* +* 4. Random, Lower triangular, CNDNUM = 2 +* + KL = MAX( M-1, 0 ) + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE +* +* 5.-19. Rectangular matrix +* + KL = MAX( M-1, 0 ) + KU = MAX( N-1, 0 ) +* + IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN +* +* 5.-14. Random, CNDNUM = 2. +* + CNDNUM = TWO + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.15 ) THEN +* +* 15. Random, CNDNUM = sqrt(0.1/EPS) +* + CNDNUM = BADC1 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.16 ) THEN +* +* 16. Random, CNDNUM = 0.1/EPS +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.17 ) THEN +* +* 17. Random, CNDNUM = 0.1/EPS, +* one small singular value S(N)=1/CNDNUM +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 2 +* + ELSE IF( IMAT.EQ.18 ) THEN +* +* 18. Random, scaled near underflow +* + CNDNUM = TWO + ANORM = SMALL + MODE = 3 +* + ELSE IF( IMAT.EQ.19 ) THEN +* +* 19. 
Random, scaled near overflow +* + CNDNUM = TWO + ANORM = LARGE + MODE = 3 +* + END IF +* + END IF * ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN * @@ -517,17 +613,18 @@ * * Set the norm and condition number. * - IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN + MAT = ABS( IMAT ) + IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN CNDNUM = BADC1 - ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN + ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN CNDNUM = BADC2 ELSE CNDNUM = TWO END IF * - IF( IMAT.EQ.4 ) THEN + IF( MAT.EQ.4 ) THEN ANORM = SMALL - ELSE IF( IMAT.EQ.5 ) THEN + ELSE IF( MAT.EQ.5 ) THEN ANORM = LARGE ELSE ANORM = ONE diff --git a/lapack-netlib/TESTING/LIN/cqpt01.f b/lapack-netlib/TESTING/LIN/cqpt01.f index 79fc2dc66..149c5bb7c 100644 --- a/lapack-netlib/TESTING/LIN/cqpt01.f +++ b/lapack-netlib/TESTING/LIN/cqpt01.f @@ -33,7 +33,8 @@ *> Householder vectors, and the rest of AF contains a partially updated *> matrix. *> -*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M) +*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) ) +*> where || . || is matrix one norm. *> \endverbatim * * Arguments: @@ -172,28 +173,28 @@ * NORMA = CLANGE( 'One-norm', M, N, A, LDA, RWORK ) * - DO 30 J = 1, K - DO 10 I = 1, MIN( J, M ) + DO J = 1, K + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = AF( I, J ) - 10 CONTINUE - DO 20 I = J + 1, M + END DO + DO I = J + 1, M WORK( ( J-1 )*M+I ) = ZERO - 20 CONTINUE - 30 CONTINUE - DO 40 J = K + 1, N + END DO + END DO + DO J = K + 1, N CALL CCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 ) - 40 CONTINUE + END DO * CALL CUNMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK, $ M, WORK( M*N+1 ), LWORK-M*N, INFO ) * - DO 50 J = 1, N + DO J = 1, N * * Compare i-th column of QR and jpvt(i)-th column of A * CALL CAXPY( M, CMPLX( -ONE ), A( 1, JPVT( J ) ), 1, $ WORK( ( J-1 )*M+1 ), 1 ) - 50 CONTINUE + END DO * CQPT01 = CLANGE( 'One-norm', M, N, WORK, M, RWORK ) / $ ( REAL( MAX( M, N ) )*SLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/cqrt11.f b/lapack-netlib/TESTING/LIN/cqrt11.f index 494d5e9cd..a52084973 100644 --- a/lapack-netlib/TESTING/LIN/cqrt11.f +++ b/lapack-netlib/TESTING/LIN/cqrt11.f @@ -157,9 +157,9 @@ CALL CUNM2R( 'Left', 'Conjugate transpose', M, M, K, A, LDA, TAU, $ WORK, M, WORK( M*M+1 ), INFO ) * - DO 10 J = 1, M + DO J = 1, M WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE - 10 CONTINUE + END DO * CQRT11 = CLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) / $ ( REAL( M )*SLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/cqrt12.f b/lapack-netlib/TESTING/LIN/cqrt12.f index 4c29423ae..0df2d833b 100644 --- a/lapack-netlib/TESTING/LIN/cqrt12.f +++ b/lapack-netlib/TESTING/LIN/cqrt12.f @@ -28,7 +28,7 @@ *> CQRT12 computes the singular values `svlues' of the upper trapezoid *> of A(1:M,1:N) and returns the ratio *> -*> || s - svlues||/(||svlues||*eps*max(M,N)) +*> || svlues -s ||/( ||s||*eps*max(M,N) ) *> \endverbatim * * Arguments: @@ -125,8 +125,8 @@ EXTERNAL CLANGE, SASUM, SLAMCH, SNRM2 * .. * .. External Subroutines .. - EXTERNAL CGEBD2, CLASCL, CLASET, SAXPY, SBDSQR, SLABAD, - $ SLASCL, XERBLA + EXTERNAL CGEBD2, CLASCL, CLASET, SAXPY, SBDSQR, SLASCL, + $ XERBLA * .. * .. Intrinsic Functions .. 
INTRINSIC CMPLX, MAX, MIN, REAL @@ -153,17 +153,16 @@ * Copy upper triangle of A into work * CALL CLASET( 'Full', M, N, CMPLX( ZERO ), CMPLX( ZERO ), WORK, M ) - DO 20 J = 1, N - DO 10 I = 1, MIN( J, M ) + DO J = 1, N + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = A( I, J ) - 10 CONTINUE - 20 CONTINUE + END DO + END DO * * Get machine parameters * SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale work if max entry outside range [SMLNUM,BIGNUM] * @@ -207,9 +206,9 @@ * ELSE * - DO 30 I = 1, MN + DO I = 1, MN RWORK( I ) = ZERO - 30 CONTINUE + END DO END IF * * Compare s and singular values of work diff --git a/lapack-netlib/TESTING/LIN/dchkaa.F b/lapack-netlib/TESTING/LIN/dchkaa.F index ef9d7808c..6582cac13 100644 --- a/lapack-netlib/TESTING/LIN/dchkaa.F +++ b/lapack-netlib/TESTING/LIN/dchkaa.F @@ -63,6 +63,7 @@ *> DLQ 8 List types on next line if 0 < NTYPES < 8 *> DQL 8 List types on next line if 0 < NTYPES < 8 *> DQP 6 List types on next line if 0 < NTYPES < 6 +*> DQK 19 List types on next line if 0 < NTYPES < 19 *> DTZ 3 List types on next line if 0 < NTYPES < 3 *> DLS 6 List types on next line if 0 < NTYPES < 6 *> DEQ @@ -149,12 +150,12 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - DOUBLE PRECISION E( NMAX ), S( 2*NMAX ) * .. * .. Allocatable Arrays .. INTEGER AllocateStatus - DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK - DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK, S + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: E + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK * .. * .. External Functions .. LOGICAL LSAME, LSAMEN @@ -164,13 +165,13 @@ * .. External Subroutines .. EXTERNAL ALAREQ, DCHKEQ, DCHKGB, DCHKGE, DCHKGT, DCHKLQ, $ DCHKORHR_COL, DCHKPB, DCHKPO, DCHKPS, DCHKPP, - $ DCHKPT, DCHKQ3, DCHKQL, DCHKQR, DCHKRQ, DCHKSP, - $ DCHKSY, DCHKSY_ROOK, DCHKSY_RK, DCHKSY_AA, - $ DCHKTB, DCHKTP, DCHKTR, DCHKTZ, DDRVGB, DDRVGE, - $ DDRVGT, DDRVLS, DDRVPB, DDRVPO, DDRVPP, DDRVPT, - $ DDRVSP, DDRVSY, DDRVSY_ROOK, DDRVSY_RK, - $ DDRVSY_AA, ILAVER, DCHKLQTP, DCHKQRT, DCHKQRTP, - $ DCHKLQT,DCHKTSQR + $ DCHKPT, DCHKQ3, DCHKQP3RK, DCHKQL, DCHKQR, + $ DCHKRQ, DCHKSP, DCHKSY, DCHKSY_ROOK, DCHKSY_RK, + $ DCHKSY_AA, DCHKTB, DCHKTP, DCHKTR, DCHKTZ, + $ DDRVGB, DDRVGE, DDRVGT, DDRVLS, DDRVPB, DDRVPO, + $ DDRVPP, DDRVPT, DDRVSP, DDRVSY, DDRVSY_ROOK, + $ DDRVSY_RK, DDRVSY_AA, ILAVER, DCHKLQTP, DCHKQRT, + $ DCHKQRTP, DCHKLQT,DCHKTSQR * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -197,6 +198,10 @@ IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" ALLOCATE ( WORK( NMAX, 3*NMAX+MAXRHS+30 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( E( NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" ALLOCATE ( RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * @@ -919,9 +924,26 @@ CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) * IF( TSTCHK ) THEN - CALL DCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ THRESH, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 3 ), WORK, IWORK, NOUT ) + CALL DCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, + $ NXVAL, THRESH, A( 1, 1 ), A( 1, 2 ), + $ B( 1, 1 ), B( 1, 3 ), WORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* QK: truncated QR factorization with pivoting +* + NTYPES = 19 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ), + $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), B( 1, 4 ), + $ WORK, IWORK, NOUT ) ELSE WRITE( NOUT, FMT = 9989 )PATH END IF @@ -1054,6 +1076,8 @@ * DEALLOCATE (A, STAT = AllocateStatus) DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (E, STAT = AllocateStatus) + DEALLOCATE (S, STAT = AllocateStatus) DEALLOCATE (WORK, STAT = AllocateStatus) DEALLOCATE (RWORK, STAT = AllocateStatus) * diff --git a/lapack-netlib/TESTING/LIN/dchkq3.f b/lapack-netlib/TESTING/LIN/dchkq3.f index 1fdf07252..494008fa8 100644 --- a/lapack-netlib/TESTING/LIN/dchkq3.f +++ b/lapack-netlib/TESTING/LIN/dchkq3.f @@ -30,7 +30,7 @@ *> *> \verbatim *> -*> DCHKQ3 tests DGEQP3. +*> DCHKQ3 tests DGEQP3. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/TESTING/LIN/dchkqp3rk.f b/lapack-netlib/TESTING/LIN/dchkqp3rk.f new file mode 100644 index 000000000..434d2067e --- /dev/null +++ b/lapack-netlib/TESTING/LIN/dchkqp3rk.f @@ -0,0 +1,832 @@ +*> \brief \b DCHKQP3RK +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, +* $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, +* $ B, COPYB, S, TAU, +* $ WORK, IWORK, NOUT ) +* IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. +* INTEGER NM, NN, NNS, NNB, NOUT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* LOGICAL DOTYPE( * ) +* INTEGER IWORK( * ), MVAL( * ), NBVAL( * ), NSVAL( * ), +* $ NVAL( * ), NXVAL( * ) +* DOUBLE PRECISION A( * ), COPYA( * ), B( * ), COPYB( * ), +* $ S( * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DCHKQP3RK tests DGEQP3RK. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] DOTYPE +*> \verbatim +*> DOTYPE is LOGICAL array, dimension (NTYPES) +*> The matrix types to be used for testing. Matrices of type j +*> (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) = +*> .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used. 
+*> \endverbatim +*> +*> \param[in] NM +*> \verbatim +*> NM is INTEGER +*> The number of values of M contained in the vector MVAL. +*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. +*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNS +*> \verbatim +*> NNS is INTEGER +*> The number of values of NRHS contained in the vector NSVAL. +*> \endverbatim +*> +*> \param[in] NSVAL +*> \verbatim +*> NSVAL is INTEGER array, dimension (NNS) +*> The values of the number of right hand sides NRHS. +*> \endverbatim +*> +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB and NX contained in the +*> vectors NBVAL and NXVAL. The blocking parameters are used +*> in pairs (NB,NX). +*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NNB) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NXVAL +*> \verbatim +*> NXVAL is INTEGER array, dimension (NNB) +*> The values of the crossover point NX. +*> \endverbatim +*> +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. +*> \endverbatim +*> +*> \param[out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (MMAX*NMAX) +*> where MMAX is the maximum value of M in MVAL and NMAX is the +*> maximum value of N in NVAL. +*> \endverbatim +*> +*> \param[out] COPYA +*> \verbatim +*> COPYA is DOUBLE PRECISION array, dimension (MMAX*NMAX) +*> \endverbatim +*> +*> \param[out] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (MMAX*NSMAX) +*> where MMAX is the maximum value of M in MVAL and NSMAX is the +*> maximum value of NRHS in NSVAL. +*> \endverbatim +*> +*> \param[out] COPYB +*> \verbatim +*> COPYB is DOUBLE PRECISION array, dimension (MMAX*NSMAX) +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is DOUBLE PRECISION array, dimension +*> (min(MMAX,NMAX)) +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is DOUBLE PRECISION array, dimension (MMAX) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension +*> (MMAX*NMAX + 4*NMAX + MMAX) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (2*NMAX) +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup double_lin +* +* ===================================================================== + SUBROUTINE DCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, + $ B, COPYB, S, TAU, + $ WORK, IWORK, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER NM, NN, NNB, NNS, NOUT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. 
+ LOGICAL DOTYPE( * ) + INTEGER IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ), + $ NSVAL( * ), NXVAL( * ) + DOUBLE PRECISION A( * ), COPYA( * ), B( * ), COPYB( * ), + $ S( * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NTYPES + PARAMETER ( NTYPES = 19 ) + INTEGER NTESTS + PARAMETER ( NTESTS = 5 ) + DOUBLE PRECISION ONE, ZERO, BIGNUM + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0, + $ BIGNUM = 1.0D+38 ) +* .. +* .. Local Scalars .. + CHARACTER DIST, TYPE + CHARACTER*3 PATH + INTEGER I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO, + $ INB, IND_OFFSET_GEN, + $ IND_IN, IND_OUT, INS, INFO, + $ ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO, + $ KFACT, KL, KMAX, KU, LDA, LW, LWORK, + $ LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N, + $ NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS, + $ NRUN, NX, T + DOUBLE PRECISION ANORM, CNDNUM, EPS, ABSTOL, RELTOL, + $ DTEMP, MAXC2NRMK, RELMAXC2NRMK +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ), ISEEDY( 4 ) + DOUBLE PRECISION RESULT( NTESTS ), RDUMMY( 1 ) +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, DQPT01, DQRT11, DQRT12, DLANGE, + $ DLAPY2 + EXTERNAL DLAMCH, DQPT01, DQRT11, DQRT12, DLANGE +* .. +* .. External Subroutines .. + EXTERNAL ALAERH, ALAHD, ALASUM, DAXPY, DGEQP3RK, + $ DLACPY, DLAORD, DLASET, DLATB4, DLATMS, + $ DORMQR, DSWAP, ICOPY, XLAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX, MIN, MOD +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, IOUNIT +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, IOUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEEDY / 1988, 1989, 1990, 1991 / +* .. +* .. Executable Statements .. +* +* Initialize constants and the random number seed. +* + PATH( 1: 1 ) = 'Double precision' + PATH( 2: 3 ) = 'QK' + NRUN = 0 + NFAIL = 0 + NERRS = 0 + DO I = 1, 4 + ISEED( I ) = ISEEDY( I ) + END DO + EPS = DLAMCH( 'Epsilon' ) + INFOT = 0 +* + DO IM = 1, NM +* +* Do for each value of M in MVAL. +* + M = MVAL( IM ) + LDA = MAX( 1, M ) +* + DO IN = 1, NN +* +* Do for each value of N in NVAL. +* + N = NVAL( IN ) + MINMN = MIN( M, N ) + LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ), + $ M*N + 2*MINMN + 4*N ) +* + DO INS = 1, NNS + NRHS = NSVAL( INS ) +* +* Set up parameters with DLATB4 and generate +* M-by-NRHS B matrix with DLATMS. +* IMAT = 14: +* Random matrix, CNDNUM = 2, NORM = ONE, +* MODE = 3 (geometric distribution of singular values). +* + CALL DLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'DLATMS' + CALL DLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYB, LDA, WORK, INFO ) + + +* +* Check error code from DLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'DLATMS', INFO, 0, ' ', M, + $ NRHS, -1, -1, -1, 6, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + DO IMAT = 1, NTYPES +* +* Do the tests only if DOTYPE( IMAT ) is true. +* + IF( .NOT.DOTYPE( IMAT ) ) + $ CYCLE +* +* The type of distribution used to generate the random +* eigen-/singular values: +* ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 ) +* +* Do for each type of NON-SYMMETRIC matrix: CNDNUM NORM MODE +* 1. Zero matrix +* 2. Random, Diagonal, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 3. Random, Upper triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 4. 
Random, Lower triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 5. Random, First column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 6. Random, Last MINMN column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 7. Random, Last N column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 8. Random, Middle column in MINMN is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 9. Random, First half of MINMN columns are zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 11. Random, Half MINMN columns in the middle are zero starting +* from MINMN/2-(MINMN/2)/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 12. Random, Odd columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 13. Random, Even columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 14. Random, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 15. Random, CNDNUM = sqrt(0.1/EPS) CNDNUM = BADC1 = sqrt(0.1/EPS) ONE 3 ( geometric distribution of singular values ) +* 16. Random, CNDNUM = 0.1/EPS CNDNUM = BADC2 = 0.1/EPS ONE 3 ( geometric distribution of singular values ) +* 17. Random, CNDNUM = 0.1/EPS, CNDNUM = BADC2 = 0.1/EPS ONE 2 ( one small singular value, S(N)=1/CNDNUM ) +* one small singular value S(N)=1/CNDNUM +* 18. Random, CNDNUM = 2, scaled near underflow CNDNUM = 2 SMALL = SAFMIN +* 19. Random, CNDNUM = 2, scaled near overflow CNDNUM = 2 LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) ) 3 ( geometric distribution of singular values ) +* + IF( IMAT.EQ.1 ) THEN +* +* Matrix 1: Zero matrix +* + CALL DLASET( 'Full', M, N, ZERO, ZERO, COPYA, LDA ) + DO I = 1, MINMN + S( I ) = ZERO + END DO +* + ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 ) + $ .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN +* +* Matrices 2-5. +* +* Set up parameters with DLATB4 and generate a test +* matrix with DLATMS. +* + CALL DLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'DLATMS' + CALL DLATMS( M, N, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYA, LDA, WORK, INFO ) +* +* Check error code from DLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'DLATMS', INFO, 0, ' ', M, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + CALL DLAORD( 'Decreasing', MINMN, S, 1 ) +* + ELSE IF( MINMN.GE.2 + $ .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN +* +* Rectangular matrices 5-13 that contain zero columns, +* only for matrices MINMN >=2. +* +* JB_ZERO is the column index of ZERO block. +* NB_ZERO is the column block size of ZERO block. +* NB_GEN is the column blcok size of the +* generated block. +* J_INC in the non_zero column index increment +* for matrix 12 and 13. +* J_FIRS_NZ is the index of the first non-zero +* column. +* + IF( IMAT.EQ.5 ) THEN +* +* First column is zero. +* + JB_ZERO = 1 + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.6 ) THEN +* +* Last column MINMN is zero. +* + JB_ZERO = MINMN + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.7 ) THEN +* +* Last column N is zero. 
+*
+                     JB_ZERO = N
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.8 ) THEN
+*
+*                    Middle column in MINMN is zero.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.9 ) THEN
+*
+*                    First half of MINMN columns are zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.10 ) THEN
+*
+*                    Last columns are zero columns,
+*                    starting from (MINMN / 2 + 1) column.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = N - JB_ZERO + 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.11 ) THEN
+*
+*                    Half of the columns in the middle of MINMN
+*                    columns are zero, starting from
+*                    MINMN/2 - (MINMN/2)/2 + 1 column.
+*
+                     JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.12 ) THEN
+*
+*                    Odd-numbered columns are zero.
+*
+                     NB_GEN = N / 2
+                     NB_ZERO = N - NB_GEN
+                     J_INC = 2
+                     J_FIRST_NZ = 2
+*
+                  ELSE IF( IMAT.EQ.13 ) THEN
+*
+*                    Even-numbered columns are zero.
+*
+                     NB_ZERO = N / 2
+                     NB_GEN = N - NB_ZERO
+                     J_INC = 2
+                     J_FIRST_NZ = 1
+*
+                  END IF
+*
+*
+*                 1) Set the first NB_ZERO columns in COPYA(1:M,1:N)
+*                    to zero.
+*
+                  CALL DLASET( 'Full', M, NB_ZERO, ZERO, ZERO,
+     $                         COPYA, LDA )
+*
+*                 2) Generate an M-by-(N-NB_ZERO) matrix with the
+*                    chosen singular value distribution
+*                    in COPYA(1:M,NB_ZERO+1:N).
+*
+                  CALL DLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU,
+     $                         ANORM, MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'DLATMS'
+*
+                  IND_OFFSET_GEN = NB_ZERO * LDA
+*
+                  CALL DLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA( IND_OFFSET_GEN + 1 ), LDA,
+     $                         WORK, INFO )
+*
+*                 Check error code from DLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'DLATMS', INFO, 0, ' ', M,
+     $                            NB_GEN, -1, -1, -1, IMAT, NFAIL,
+     $                            NERRS, NOUT )
+                     CYCLE
+                  END IF
+*
+*                 3) Swap the generated columns from the right-side
+*                    NB_GEN-size block in COPYA into the correct
+*                    column positions.
+*
+                  IF( IMAT.EQ.6
+     $                .OR. IMAT.EQ.7
+     $                .OR. IMAT.EQ.8
+     $                .OR. IMAT.EQ.10
+     $                .OR. IMAT.EQ.11 ) THEN
+*
+*                    Move by swapping the generated columns
+*                    (NB_ZERO+1:NB_ZERO+JB_ZERO-1) of the right
+*                    NB_GEN-size block into columns (1:JB_ZERO-1).
+*
+                     DO J = 1, JB_ZERO-1, 1
+                        CALL DSWAP( M,
+     $                       COPYA( ( NB_ZERO+J-1)*LDA+1), 1,
+     $                       COPYA( (J-1)*LDA + 1 ), 1 )
+                     END DO
+*
+                  ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN
+*
+*                    ( IMAT = 12, Odd-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the even zero columns in
+*                    the left NB_ZERO-size block.
+*
+*                    ( IMAT = 13, Even-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the odd zero columns in
+*                    the left NB_ZERO-size block.
+*
+                     DO J = 1, NB_GEN, 1
+                        IND_OUT = ( NB_ZERO+J-1 )*LDA + 1
+                        IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA
+     $                           + 1
+                        CALL DSWAP( M,
+     $                       COPYA( IND_OUT ), 1,
+     $                       COPYA( IND_IN), 1 )
+                     END DO
+*
+                  END IF
+*
+*                 4) Order the singular values generated by
+*                    DLATMS in decreasing order and add trailing zeros
+*                    that correspond to zero columns.
+*                    The total number of singular values is MINMN.
+*
+                  MINMNB_GEN = MIN( M, NB_GEN )
+*
+                  DO I = MINMNB_GEN+1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE
+*
+*                 IF(MINMN.LT.2) skip this size for this matrix type.
+*
+                  CYCLE
+               END IF
+*
+*              Initialize a copy array for a pivot array for DGEQP3RK.
+*
+               DO I = 1, N
+                  IWORK( I ) = 0
+               END DO
+*
+               DO INB = 1, NNB
+*
+*                 Do for each pair of values (NB,NX) in NBVAL and NXVAL.
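+*                 XLAENV( 1, NB ) and XLAENV( 3, NX ) override the
+*                 block size and crossover point reported by ILAENV,
+*                 so each (NB,NX) pair exercises a different blocked
+*                 code path.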
+*
+                  NB = NBVAL( INB )
+                  CALL XLAENV( 1, NB )
+                  NX = NXVAL( INB )
+                  CALL XLAENV( 3, NX )
+*
+*                 We use MIN(M,N)+1 as the loop bound because we need
+*                 a test with KMAX larger than MIN(M,N); in that case
+*                 the factorization should proceed as if
+*                 KMAX = MIN(M,N).
+*
+                  DO KMAX = 0, MIN(M,N)+1
+*
+*                    Get a working copy of COPYA into A( 1:M,1:N ).
+*                    Get a working copy of COPYB into
+*                    A( 1:M, (N+1):(N+NRHS) ).
+*                    Get a working copy of COPYB into B( 1:M, 1:NRHS ).
+*                    Get a working copy of the zeroed IWORK(1:N) into
+*                    IWORK( N+1:2N ), which is going to be used as
+*                    the pivot array.
+*                    NOTE: IWORK(2N+1:3N) is going to be used as a WORK
+*                    array for the routine.
+*
+                     CALL DLACPY( 'All', M, N, COPYA, LDA, A, LDA )
+                     CALL DLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                            A( LDA*N + 1 ), LDA )
+                     CALL DLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                            B, LDA )
+                     CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 )
+*
+                     ABSTOL = -1.0
+                     RELTOL = -1.0
+*
+*                    Compute the QR factorization with pivoting of A
+*
+                     LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ),
+     $                                 3*N + NRHS - 1 ) )
+*
+*                    Compute DGEQP3RK factorization of A.
+*
+                     SRNAMT = 'DGEQP3RK'
+                     CALL DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                              A, LDA, KFACT, MAXC2NRMK,
+     $                              RELMAXC2NRMK, IWORK( N+1 ), TAU,
+     $                              WORK, LW, IWORK( 2*N+1 ), INFO )
+*
+*                    Check error code from DGEQP3RK.
+*
+                     IF( INFO.LT.0 )
+     $                  CALL ALAERH( PATH, 'DGEQP3RK', INFO, 0, ' ',
+     $                               M, N, NX, -1, NB, IMAT,
+     $                               NFAIL, NERRS, NOUT )
+*
+*                    Compute test 1:
+*
+*                    This test is only for the full-rank factorization
+*                    of the matrix A.
+*
+*                    Array S(1:min(M,N)) contains svd(A), the singular
+*                    values of the original matrix A in decreasing
+*                    absolute value order. The test computes svd(R),
+*                    the vector of singular values of the upper
+*                    trapezoid of A(1:M,1:N) that contains the factor
+*                    R, in decreasing order. The test returns
+*                    the ratio:
+*
+*                    2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS )
+*
+                     IF( KFACT.EQ.MINMN ) THEN
+*
+                        RESULT( 1 ) = DQRT12( M, N, A, LDA, S, WORK,
+     $                                LWORK )
+*
+                        DO T = 1, 1
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK',
+     $                            M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T, RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                    End test 1
+*
+                     END IF
+*
+*                    Compute test 2:
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS )
+*
+                     RESULT( 2 ) = DQPT01( M, N, KFACT, COPYA, A, LDA,
+     $                             TAU, IWORK( N+1 ), WORK, LWORK )
+*
+*                    Compute test 3:
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( Q**T * Q - I ) / ( M * EPS )
+*
+                     RESULT( 3 ) = DQRT11( M, KFACT, A, LDA, TAU, WORK,
+     $                             LWORK )
+*
+*                    Print information about the tests that did not
+*                    pass the threshold.
+*
+                     DO T = 2, 3
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK', M, N,
+     $                         NRHS, KMAX, ABSTOL, RELTOL,
+     $                         NB, NX, IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 2
+*
+*                    Compute test 4:
+*
+*                    This test is only for factorizations with rank
+*                    at least 2.
+*                    The elements on the diagonal of R should be
+*                    non-increasing.
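+*                    (Illustration: a diagonal of ( 3.0, 2.0, 0.5 )
+*                    passes, while ( 3.0, 0.5, 2.0 ) must fail, since
+*                    abs(R(3,3)) > abs(R(2,2)).)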
+*
+*                    The test returns the ratio:
+*
+*                    Returns BIGNUM if abs(R(K+1,K+1)) > abs(R(K,K)),
+*                    K=1:KFACT-1
+*
+                     IF( MIN(KFACT, MINMN).GE.2 ) THEN
+*
+*                       Initialize the ratio before scanning
+*                       the diagonal.
+*
+                        RESULT( 4 ) = ZERO
+*
+                        DO J = 1, KFACT-1, 1
+
+                           DTEMP = (( ABS( A( (J-1)*M+J ) ) -
+     $                        ABS( A( (J)*M+J+1 ) ) ) /
+     $                        ABS( A(1) ) )
+*
+                           IF( DTEMP.LT.ZERO ) THEN
+                              RESULT( 4 ) = BIGNUM
+                           END IF
+*
+                        END DO
+*
+*                       Print information about the tests that did not
+*                       pass the threshold.
+*
+                        DO T = 4, 4
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK',
+     $                            M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T,
+     $                            RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                    End test 4.
+*
+                     END IF
+*
+*                    Compute test 5:
+*
+*                    This test is only for matrix A with min(M,N) > 0.
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( (Q**T * B returned by DGEQP3RK) -
+*                            (Q**T * B computed by DORMQR) ) /
+*                    ( M * EPS )
+*
+*                    (1) Compute B:=Q**T * B in the matrix B.
+*
+                     IF( MINMN.GT.0 ) THEN
+*
+                        LWORK_MQR = MAX(1, NRHS)
+                        CALL DORMQR( 'Left', 'Transpose',
+     $                       M, NRHS, KFACT, A, LDA, TAU, B, LDA,
+     $                       WORK, LWORK_MQR, INFO )
+*
+                        DO I = 1, NRHS
+*
+*                          Subtract the (N+I)-th column of A from the
+*                          I-th column of B.
+*
+                           CALL DAXPY( M, -ONE, A( ( N+I-1 )*LDA+1 ),
+     $                          1, B( ( I-1 )*LDA+1 ), 1 )
+                        END DO
+*
+                        RESULT( 5 ) =
+     $                    ABS(
+     $                    DLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY )
+     $                    / ( DBLE( M )*DLAMCH( 'Epsilon' ) )
+     $                    )
+*
+*                       Print information about the tests that did not
+*                       pass the threshold.
+*
+                        DO T = 5, 5
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK',
+     $                            M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T, RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                    End compute test 5.
+*
+                     END IF
+*
+*                 END DO KMAX = 0, MIN(M,N)+1
+*
+                  END DO
+*
+*              END DO for INB = 1, NNB
+*
+               END DO
+*
+*           END DO for IMAT = 1, NTYPES
+*
+            END DO
+*
+*        END DO for INS = 1, NNS
+*
+         END DO
+*
+*     END DO for IN = 1, NN
+*
+      END DO
+*
+*     END DO for IM = 1, NM
+*
+      END DO
+*
+*     Print a summary of the results.
+*
+      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
+*
+ 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5,
+     $      ', KMAX =', I5, ', ABSTOL =', G12.5,
+     $      ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4,
+     $      ', type ', I2, ', test ', I2, ', ratio =', G12.5 )
+*
+*     End of DCHKQP3RK
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f
index bc4e77a5a..1940351a4 100644
--- a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f
@@ -421,9 +421,9 @@
 *              block factorization, LWORK is the length of AINV.
 *
                SRNAMT = 'DSYTRF_AA_2STAGE'
-               LWORK = MIN(N*NB, 3*NMAX*NMAX)
+               LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
                CALL DSYTRF_AA_2STAGE( UPLO, N, AFAC, LDA,
-     $                                AINV, (3*NB+1)*N,
+     $                                AINV, MAX( 1, (3*NB+1)*N ),
      $                                IWORK, IWORK( 1+N ),
      $                                WORK, LWORK,
      $                                INFO )
@@ -503,7 +503,6 @@ c           NT = 1
                CALL DLACPY( 'Full', N, NRHS, B, LDA, X, LDA )
 *
                SRNAMT = 'DSYTRS_AA_2STAGE'
-               LWORK = MAX( 1, 3*N-2 )
                CALL DSYTRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
      $              AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ),
      $              X, LDA, INFO )
diff --git a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f
index 91c9e8e9a..d04106ae3 100644
--- a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f
@@ -400,9 +400,9 @@
 *           Factor the matrix and solve the system using DSYSV_AA.
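+*           (The MAX( 1, ... ) guards below keep the workspace sizes
+*           positive when N*NB = 0; LAPACK requires LWORK >= 1.)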
 *
             SRNAMT = 'DSYSV_AA_2STAGE '
-            LWORK = MIN(N*NB, 3*NMAX*NMAX)
+            LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
             CALL DSYSV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
-     $                            AINV, (3*NB+1)*N,
+     $                            AINV, MAX( 1, (3*NB+1)*N ),
      $                            IWORK, IWORK( 1+N ),
      $                            X, LDA, WORK, LWORK, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/dlatb4.f b/lapack-netlib/TESTING/LIN/dlatb4.f
index 8825d13e7..f3bccd45b 100644
--- a/lapack-netlib/TESTING/LIN/dlatb4.f
+++ b/lapack-netlib/TESTING/LIN/dlatb4.f
@@ -133,7 +133,7 @@
 *
 *     .. Parameters ..
       DOUBLE PRECISION   SHRINK, TENTH
-      PARAMETER          ( SHRINK = 0.25D0, TENTH = 0.1D+0 )
+      PARAMETER          ( SHRINK = 0.25D+0, TENTH = 0.1D+0 )
       DOUBLE PRECISION   ONE
       PARAMETER          ( ONE = 1.0D+0 )
       DOUBLE PRECISION   TWO
@@ -153,9 +153,6 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           DLABAD
-*     ..
 *     .. Save statement ..
       SAVE               EPS, SMALL, LARGE, BADC1, BADC2, FIRST
 *     ..
@@ -173,11 +170,6 @@
          BADC1 = SQRT( BADC2 )
          SMALL = DLAMCH( 'Safe minimum' )
          LARGE = ONE / SMALL
-*
-*        If it looks like we're on a Cray, take the square root of
-*        SMALL and LARGE to avoid overflow and underflow problems.
-*
-         CALL DLABAD( SMALL, LARGE )
          SMALL = SHRINK*( SMALL / EPS )
          LARGE = ONE / SMALL
       END IF
@@ -232,6 +224,110 @@
       ELSE
          ANORM = ONE
       END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        xQK: truncated QR with pivoting.
+*             Set parameters to generate a general
+*             M x N matrix.
+*
+*        Set TYPE, the type of matrix to be generated. 'N' is nonsymmetric.
+*
+         TYPE = 'N'
+*
+*        Set DIST, the type of distribution for the random
+*        number generator. 'S' is symmetric (uniform on (-1,1)).
+*
+         DIST = 'S'
+*
+*        Set the lower and upper bandwidths.
+*
+         IF( IMAT.EQ.2 ) THEN
+*
+*           2. Random, Diagonal, CNDNUM = 2
+*
+            KL = 0
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.3 ) THEN
+*
+*           3. Random, Upper triangular, CNDNUM = 2
+*
+            KL = 0
+            KU = MAX( N-1, 0 )
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.4 ) THEN
+*
+*           4. Random, Lower triangular, CNDNUM = 2
+*
+            KL = MAX( M-1, 0 )
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE
+*
+*           5.-19. Rectangular matrix
+*
+            KL = MAX( M-1, 0 )
+            KU = MAX( N-1, 0 )
+*
+            IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN
+*
+*              5.-14. Random, CNDNUM = 2.
+*
+               CNDNUM = TWO
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.15 ) THEN
+*
+*              15. Random, CNDNUM = sqrt(0.1/EPS)
+*
+               CNDNUM = BADC1
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.16 ) THEN
+*
+*              16. Random, CNDNUM = 0.1/EPS
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.17 ) THEN
+*
+*              17. Random, CNDNUM = 0.1/EPS,
+*                  one small singular value S(N)=1/CNDNUM
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 2
+*
+            ELSE IF( IMAT.EQ.18 ) THEN
+*
+*              18. Random, scaled near underflow
+*
+               CNDNUM = TWO
+               ANORM = SMALL
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.19 ) THEN
+*
+*              19. Random, scaled near overflow
+*
+               CNDNUM = TWO
+               ANORM = LARGE
+               MODE = 3
+*
+            END IF
+*
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN
 *
@@ -518,17 +614,18 @@
 *
 *        Set the norm and condition number.
 *
-         IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN
+         MAT = ABS( IMAT )
+         IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN
             CNDNUM = BADC1
-         ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN
+         ELSE IF( MAT.EQ.3 .OR.
MAT.EQ.9 ) THEN CNDNUM = BADC2 ELSE CNDNUM = TWO END IF * - IF( IMAT.EQ.4 ) THEN + IF( MAT.EQ.4 ) THEN ANORM = SMALL - ELSE IF( IMAT.EQ.5 ) THEN + ELSE IF( MAT.EQ.5 ) THEN ANORM = LARGE ELSE ANORM = ONE diff --git a/lapack-netlib/TESTING/LIN/dqpt01.f b/lapack-netlib/TESTING/LIN/dqpt01.f index 8efbdc774..af3f5dd36 100644 --- a/lapack-netlib/TESTING/LIN/dqpt01.f +++ b/lapack-netlib/TESTING/LIN/dqpt01.f @@ -28,12 +28,13 @@ *> *> DQPT01 tests the QR-factorization with pivoting of a matrix A. The *> array AF contains the (possibly partial) QR-factorization of A, where -*> the upper triangle of AF(1:k,1:k) is a partial triangular factor, -*> the entries below the diagonal in the first k columns are the +*> the upper triangle of AF(1:K,1:K) is a partial triangular factor, +*> the entries below the diagonal in the first K columns are the *> Householder vectors, and the rest of AF contains a partially updated *> matrix. *> -*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M) +*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) ), +*> where || . || is matrix one norm. *> \endverbatim * * Arguments: @@ -172,28 +173,41 @@ * NORMA = DLANGE( 'One-norm', M, N, A, LDA, RWORK ) * - DO 30 J = 1, K - DO 10 I = 1, MIN( J, M ) + DO J = 1, K +* +* Copy the upper triangular part of the factor R stored +* in AF(1:K,1:K) into the work array WORK. +* + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = AF( I, J ) - 10 CONTINUE - DO 20 I = J + 1, M + END DO +* +* Zero out the elements below the diagonal in the work array. +* + DO I = J + 1, M WORK( ( J-1 )*M+I ) = ZERO - 20 CONTINUE - 30 CONTINUE - DO 40 J = K + 1, N + END DO + END DO +* +* Copy columns (K+1,N) from AF into the work array WORK. +* AF(1:K,K+1:N) contains the rectangular block of the upper trapezoidal +* factor R, AF(K+1:M,K+1:N) contains the partially updated residual +* matrix of R. +* + DO J = K + 1, N CALL DCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 ) - 40 CONTINUE + END DO * CALL DORMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK, $ M, WORK( M*N+1 ), LWORK-M*N, INFO ) * - DO 50 J = 1, N + DO J = 1, N * -* Compare i-th column of QR and jpvt(i)-th column of A +* Compare J-th column of QR and JPVT(J)-th column of A. * CALL DAXPY( M, -ONE, A( 1, JPVT( J ) ), 1, WORK( ( J-1 )*M+1 ), $ 1 ) - 50 CONTINUE + END DO * DQPT01 = DLANGE( 'One-norm', M, N, WORK, M, RWORK ) / $ ( DBLE( MAX( M, N ) )*DLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/dqrt11.f b/lapack-netlib/TESTING/LIN/dqrt11.f index 33c7fab37..38bbeb822 100644 --- a/lapack-netlib/TESTING/LIN/dqrt11.f +++ b/lapack-netlib/TESTING/LIN/dqrt11.f @@ -157,9 +157,9 @@ CALL DORM2R( 'Left', 'Transpose', M, M, K, A, LDA, TAU, WORK, M, $ WORK( M*M+1 ), INFO ) * - DO 10 J = 1, M + DO J = 1, M WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE - 10 CONTINUE + END DO * DQRT11 = DLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) / $ ( DBLE( M )*DLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/dqrt12.f b/lapack-netlib/TESTING/LIN/dqrt12.f index 278e01bf0..b8a124c59 100644 --- a/lapack-netlib/TESTING/LIN/dqrt12.f +++ b/lapack-netlib/TESTING/LIN/dqrt12.f @@ -26,7 +26,7 @@ *> DQRT12 computes the singular values `svlues' of the upper trapezoid *> of A(1:M,1:N) and returns the ratio *> -*> || s - svlues||/(||svlues||*eps*max(M,N)) +*> || svlues - s ||/(||s||*eps*max(M,N)) *> \endverbatim * * Arguments: @@ -113,8 +113,7 @@ EXTERNAL DASUM, DLAMCH, DLANGE, DNRM2 * .. * .. External Subroutines .. 
-      EXTERNAL           DAXPY, DBDSQR, DGEBD2, DLABAD, DLASCL, DLASET,
-     $                   XERBLA
+      EXTERNAL           DAXPY, DBDSQR, DGEBD2, DLASCL, DLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          DBLE, MAX, MIN
@@ -145,17 +144,16 @@
 *     Copy upper triangle of A into work
 *
       CALL DLASET( 'Full', M, N, ZERO, ZERO, WORK, M )
-      DO 20 J = 1, N
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, N
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = A( I, J )
-   10    CONTINUE
-   20 CONTINUE
+         END DO
+      END DO
 *
 *     Get machine parameters
 *
       SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
 *
 *     Scale work if max entry outside range [SMLNUM,BIGNUM]
 *
@@ -199,16 +197,18 @@
 *
       ELSE
 *
-         DO 30 I = 1, MN
+         DO I = 1, MN
             WORK( M*N+I ) = ZERO
-   30    CONTINUE
+         END DO
       END IF
 *
 *     Compare s and singular values of work
 *
       CALL DAXPY( MN, -ONE, S, 1, WORK( M*N+1 ), 1 )
+*
       DQRT12 = DASUM( MN, WORK( M*N+1 ), 1 ) /
-     $         ( DLAMCH( 'Epsilon' )*DBLE( MAX( M, N ) ) )
+     $         ( DLAMCH('Epsilon') * DBLE( MAX( M, N ) ) )
+*
       IF( NRMSVL.NE.ZERO )
      $   DQRT12 = DQRT12 / NRMSVL
 *
diff --git a/lapack-netlib/TESTING/LIN/schkaa.F b/lapack-netlib/TESTING/LIN/schkaa.F
index a5b826d06..036b13924 100644
--- a/lapack-netlib/TESTING/LIN/schkaa.F
+++ b/lapack-netlib/TESTING/LIN/schkaa.F
@@ -63,6 +63,7 @@
 *> SLQ    8    List types on next line if 0 < NTYPES <  8
 *> SQL    8    List types on next line if 0 < NTYPES <  8
 *> SQP    6    List types on next line if 0 < NTYPES <  6
+*> SQK   19    List types on next line if 0 < NTYPES < 19
 *> STZ    3    List types on next line if 0 < NTYPES <  3
 *> SLS    6    List types on next line if 0 < NTYPES <  6
 *> SEQ
@@ -147,11 +148,11 @@
      $                   NBVAL( MAXIN ), NBVAL2( MAXIN ),
      $                   NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ),
      $                   RANKVAL( MAXIN ), PIV( NMAX )
-      REAL               E( NMAX ), S( 2*NMAX )
 *     ..
 *     .. Allocatable Arrays ..
       INTEGER            AllocateStatus
-      REAL, DIMENSION(:), ALLOCATABLE :: RWORK
+      REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S
+      REAL, DIMENSION(:), ALLOCATABLE :: E
       REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK
 *     ..
 *     .. External Functions ..
@@ -162,13 +163,13 @@
 *     .. External Subroutines ..
       EXTERNAL           ALAREQ, SCHKEQ, SCHKGB, SCHKGE, SCHKGT, SCHKLQ,
      $                   SCHKORHR_COL, SCHKPB, SCHKPO, SCHKPS, SCHKPP,
-     $                   SCHKPT, SCHKQ3, SCHKQL, SCHKQR, SCHKRQ, SCHKSP,
-     $                   SCHKSY, SCHKSY_ROOK, SCHKSY_RK, SCHKSY_AA,
-     $                   SCHKTB, SCHKTP, SCHKTR, SCHKTZ, SDRVGB, SDRVGE,
-     $                   SDRVGT, SDRVLS, SDRVPB, SDRVPO, SDRVPP, SDRVPT,
-     $                   SDRVSP, SDRVSY, SDRVSY_ROOK, SDRVSY_RK,
-     $                   SDRVSY_AA, ILAVER, SCHKLQTP, SCHKQRT, SCHKQRTP,
-     $                   SCHKLQT, SCHKTSQR
+     $                   SCHKPT, SCHKQ3, SCHKQP3RK, SCHKQL, SCHKQR,
+     $                   SCHKRQ, SCHKSP, SCHKSY, SCHKSY_ROOK, SCHKSY_RK,
+     $                   SCHKSY_AA, SCHKTB, SCHKTP, SCHKTR, SCHKTZ,
+     $                   SDRVGB, SDRVGE, SDRVGT, SDRVLS, SDRVPB, SDRVPO,
+     $                   SDRVPP, SDRVPT, SDRVSP, SDRVSY, SDRVSY_ROOK,
+     $                   SDRVSY_RK, SDRVSY_AA, ILAVER, SCHKLQTP, SCHKQRT,
+     $                   SCHKQRTP, SCHKLQT, SCHKTSQR
 *     ..
 *     .. Scalars in Common ..
       LOGICAL            LERR, OK
@@ -188,13 +189,17 @@
 *     ..
 *     .. Allocate memory dynamically ..
* - ALLOCATE (A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) + ALLOCATE ( A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) + ALLOCATE ( B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (WORK( NMAX, NMAX+MAXRHS+30 ) , STAT = AllocateStatus ) + ALLOCATE ( WORK( NMAX, 3*NMAX+MAXRHS+30 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) + ALLOCATE ( E( NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. * .. Executable Statements .. @@ -920,6 +925,23 @@ ELSE WRITE( NOUT, FMT = 9989 )PATH END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* QK: truncated QR factorization with pivoting +* + NTYPES = 19 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ), + $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), B( 1, 4 ), + $ WORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF * ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN * @@ -1048,6 +1070,8 @@ * DEALLOCATE (A, STAT = AllocateStatus) DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (E, STAT = AllocateStatus) + DEALLOCATE (S, STAT = AllocateStatus) DEALLOCATE (WORK, STAT = AllocateStatus) DEALLOCATE (RWORK, STAT = AllocateStatus) * diff --git a/lapack-netlib/TESTING/LIN/schkqp3rk.f b/lapack-netlib/TESTING/LIN/schkqp3rk.f new file mode 100644 index 000000000..36cf9370e --- /dev/null +++ b/lapack-netlib/TESTING/LIN/schkqp3rk.f @@ -0,0 +1,831 @@ +*> \brief \b SCHKQP3RK +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, +* $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, +* $ B, COPYB, S, TAU, +* $ WORK, IWORK, NOUT ) +* IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. +* INTEGER NM, NN, NNS, NNB, NOUT +* REAL THRESH +* .. +* .. Array Arguments .. +* LOGICAL DOTYPE( * ) +* INTEGER IWORK( * ), MVAL( * ), NBVAL( * ), NSVAL( * ), +* $ NVAL( * ), NXVAL( * ) +* REAL A( * ), COPYA( * ), B( * ), COPYB( * ), +* $ S( * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SCHKQP3RK tests SGEQP3RK. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] DOTYPE +*> \verbatim +*> DOTYPE is LOGICAL array, dimension (NTYPES) +*> The matrix types to be used for testing. Matrices of type j +*> (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) = +*> .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used. +*> \endverbatim +*> +*> \param[in] NM +*> \verbatim +*> NM is INTEGER +*> The number of values of M contained in the vector MVAL. +*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. 
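+*>          (Each M is paired with every N, NRHS, and (NB,NX) value
+*>          in the test loops.)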
+*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNS +*> \verbatim +*> NNS is INTEGER +*> The number of values of NRHS contained in the vector NSVAL. +*> \endverbatim +*> +*> \param[in] NSVAL +*> \verbatim +*> NSVAL is INTEGER array, dimension (NNS) +*> The values of the number of right hand sides NRHS. +*> \endverbatim +*> +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB and NX contained in the +*> vectors NBVAL and NXVAL. The blocking parameters are used +*> in pairs (NB,NX). +*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NNB) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NXVAL +*> \verbatim +*> NXVAL is INTEGER array, dimension (NNB) +*> The values of the crossover point NX. +*> \endverbatim +*> +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. +*> \endverbatim +*> +*> \param[out] A +*> \verbatim +*> A is REAL array, dimension (MMAX*NMAX) +*> where MMAX is the maximum value of M in MVAL and NMAX is the +*> maximum value of N in NVAL. +*> \endverbatim +*> +*> \param[out] COPYA +*> \verbatim +*> COPYA is REAL array, dimension (MMAX*NMAX) +*> \endverbatim +*> +*> \param[out] B +*> \verbatim +*> B is REAL array, dimension (MMAX*NSMAX) +*> where MMAX is the maximum value of M in MVAL and NSMAX is the +*> maximum value of NRHS in NSVAL. +*> \endverbatim +*> +*> \param[out] COPYB +*> \verbatim +*> COPYB is REAL array, dimension (MMAX*NSMAX) +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is REAL array, dimension +*> (min(MMAX,NMAX)) +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is REAL array, dimension (MMAX) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension +*> (MMAX*NMAX + 4*NMAX + MMAX) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (2*NMAX) +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup single_lin +* +* ===================================================================== + SUBROUTINE SCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, + $ B, COPYB, S, TAU, + $ WORK, IWORK, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER NM, NN, NNB, NNS, NOUT + REAL THRESH +* .. +* .. Array Arguments .. + LOGICAL DOTYPE( * ) + INTEGER IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ), + $ NSVAL( * ), NXVAL( * ) + REAL A( * ), COPYA( * ), B( * ), COPYB( * ), + $ S( * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. 
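+*     NTYPES is the number of matrix types, NTESTS the number of test
+*     ratios per run; BIGNUM is the failure value assigned by the
+*     monotonicity check in test 4.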
+ INTEGER NTYPES + PARAMETER ( NTYPES = 19 ) + INTEGER NTESTS + PARAMETER ( NTESTS = 5 ) + REAL ONE, ZERO, BIGNUM + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0, + $ BIGNUM = 1.0E+38 ) +* .. +* .. Local Scalars .. + CHARACTER DIST, TYPE + CHARACTER*3 PATH + INTEGER I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO, + $ INB, IND_OFFSET_GEN, + $ IND_IN, IND_OUT, INS, INFO, + $ ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO, + $ KFACT, KL, KMAX, KU, LDA, LW, LWORK, + $ LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N, + $ NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS, + $ NRUN, NX, T + REAL ANORM, CNDNUM, EPS, ABSTOL, RELTOL, + $ DTEMP, MAXC2NRMK, RELMAXC2NRMK +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ), ISEEDY( 4 ) + REAL RESULT( NTESTS ), RDUMMY( 1 ) +* .. +* .. External Functions .. + REAL SLAMCH, SQPT01, SQRT11, SQRT12, SLANGE + EXTERNAL SLAMCH, SQPT01, SQRT11, SQRT12, SLANGE +* .. +* .. External Subroutines .. + EXTERNAL ALAERH, ALAHD, ALASUM, SAXPY, SGEQP3RK, + $ SLACPY, SLAORD, SLASET, SLATB4, SLATMS, + $ SORMQR, SSWAP, ICOPY, XLAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, MOD, REAL +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, IOUNIT +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, IOUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEEDY / 1988, 1989, 1990, 1991 / +* .. +* .. Executable Statements .. +* +* Initialize constants and the random number seed. +* + PATH( 1: 1 ) = 'Single precision' + PATH( 2: 3 ) = 'QK' + NRUN = 0 + NFAIL = 0 + NERRS = 0 + DO I = 1, 4 + ISEED( I ) = ISEEDY( I ) + END DO + EPS = SLAMCH( 'Epsilon' ) + INFOT = 0 +* + DO IM = 1, NM +* +* Do for each value of M in MVAL. +* + M = MVAL( IM ) + LDA = MAX( 1, M ) +* + DO IN = 1, NN +* +* Do for each value of N in NVAL. +* + N = NVAL( IN ) + MINMN = MIN( M, N ) + LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ), + $ M*N + 2*MINMN + 4*N ) +* + DO INS = 1, NNS + NRHS = NSVAL( INS ) +* +* Set up parameters with SLATB4 and generate +* M-by-NRHS B matrix with SLATMS. +* IMAT = 14: +* Random matrix, CNDNUM = 2, NORM = ONE, +* MODE = 3 (geometric distribution of singular values). +* + CALL SLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'SLATMS' + CALL SLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYB, LDA, WORK, INFO ) + + +* +* Check error code from SLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'SLATMS', INFO, 0, ' ', M, + $ NRHS, -1, -1, -1, 6, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + DO IMAT = 1, NTYPES +* +* Do the tests only if DOTYPE( IMAT ) is true. +* + IF( .NOT.DOTYPE( IMAT ) ) + $ CYCLE +* +* The type of distribution used to generate the random +* eigen-/singular values: +* ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 ) +* +* Do for each type of NON-SYMMETRIC matrix: CNDNUM NORM MODE +* 1. Zero matrix +* 2. Random, Diagonal, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 3. Random, Upper triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 4. Random, Lower triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 5. Random, First column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 6. Random, Last MINMN column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 7. 
Random, Last N column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+* 8. Random, Middle column in MINMN is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+* 9. Random, First half of MINMN columns are zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+* 10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+* 11. Random, Half MINMN columns in the middle are zero starting
+*     from MINMN/2-(MINMN/2)/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+* 12. Random, Odd columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+* 13. Random, Even columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+* 14. Random, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+* 15. Random, CNDNUM = sqrt(0.1/EPS) CNDNUM = BADC1 = sqrt(0.1/EPS) ONE 3 ( geometric distribution of singular values )
+* 16. Random, CNDNUM = 0.1/EPS CNDNUM = BADC2 = 0.1/EPS ONE 3 ( geometric distribution of singular values )
+* 17. Random, CNDNUM = 0.1/EPS, CNDNUM = BADC2 = 0.1/EPS ONE 2 ( one small singular value, S(N)=1/CNDNUM )
+*     one small singular value S(N)=1/CNDNUM
+* 18. Random, CNDNUM = 2, scaled near underflow CNDNUM = 2 SMALL = SAFMIN
+* 19. Random, CNDNUM = 2, scaled near overflow CNDNUM = 2 LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) ) 3 ( geometric distribution of singular values )
+*
+               IF( IMAT.EQ.1 ) THEN
+*
+*                 Matrix 1: Zero matrix
+*
+                  CALL SLASET( 'Full', M, N, ZERO, ZERO, COPYA, LDA )
+                  DO I = 1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 )
+     $                 .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN
+*
+*                 Matrices 2-4 and 14-19.
+*
+*                 Set up parameters with SLATB4 and generate a test
+*                 matrix with SLATMS.
+*
+                  CALL SLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'SLATMS'
+                  CALL SLATMS( M, N, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA, LDA, WORK, INFO )
+*
+*                 Check error code from SLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'SLATMS', INFO, 0, ' ', M, N,
+     $                            -1, -1, -1, IMAT, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+                  CALL SLAORD( 'Decreasing', MINMN, S, 1 )
+*
+               ELSE IF( MINMN.GE.2
+     $                 .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN
+*
+*                 Rectangular matrices 5-13 that contain zero columns,
+*                 only for matrices with MINMN >= 2.
+*
+*                 JB_ZERO is the column index of the ZERO block.
+*                 NB_ZERO is the column block size of the ZERO block.
+*                 NB_GEN is the column block size of the
+*                 generated block.
+*                 J_INC is the non-zero column index increment
+*                 for matrices 12 and 13.
+*                 J_FIRST_NZ is the index of the first non-zero
+*                 column.
+*
+                  IF( IMAT.EQ.5 ) THEN
+*
+*                    First column is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.6 ) THEN
+*
+*                    Last column MINMN is zero.
+*
+                     JB_ZERO = MINMN
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.7 ) THEN
+*
+*                    Last column N is zero.
+*
+                     JB_ZERO = N
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.8 ) THEN
+*
+*                    Middle column in MINMN is zero.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.9 ) THEN
+*
+*                    First half of MINMN columns are zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.10 ) THEN
+*
+*                    Last columns are zero columns,
+*                    starting from (MINMN / 2 + 1) column.
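+*                    (For instance, M = 10, N = 8 gives MINMN = 8, so
+*                    columns 5:8 are zeroed and NB_GEN = 4 columns are
+*                    generated.)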
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = N - JB_ZERO + 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.11 ) THEN
+*
+*                    Half of the columns in the middle of MINMN
+*                    columns are zero, starting from
+*                    MINMN/2 - (MINMN/2)/2 + 1 column.
+*
+                     JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.12 ) THEN
+*
+*                    Odd-numbered columns are zero.
+*
+                     NB_GEN = N / 2
+                     NB_ZERO = N - NB_GEN
+                     J_INC = 2
+                     J_FIRST_NZ = 2
+*
+                  ELSE IF( IMAT.EQ.13 ) THEN
+*
+*                    Even-numbered columns are zero.
+*
+                     NB_ZERO = N / 2
+                     NB_GEN = N - NB_ZERO
+                     J_INC = 2
+                     J_FIRST_NZ = 1
+*
+                  END IF
+*
+*
+*                 1) Set the first NB_ZERO columns in COPYA(1:M,1:N)
+*                    to zero.
+*
+                  CALL SLASET( 'Full', M, NB_ZERO, ZERO, ZERO,
+     $                         COPYA, LDA )
+*
+*                 2) Generate an M-by-(N-NB_ZERO) matrix with the
+*                    chosen singular value distribution
+*                    in COPYA(1:M,NB_ZERO+1:N).
+*
+                  CALL SLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU,
+     $                         ANORM, MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'SLATMS'
+*
+                  IND_OFFSET_GEN = NB_ZERO * LDA
+*
+                  CALL SLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA( IND_OFFSET_GEN + 1 ), LDA,
+     $                         WORK, INFO )
+*
+*                 Check error code from SLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'SLATMS', INFO, 0, ' ', M,
+     $                            NB_GEN, -1, -1, -1, IMAT, NFAIL,
+     $                            NERRS, NOUT )
+                     CYCLE
+                  END IF
+*
+*                 3) Swap the generated columns from the right-side
+*                    NB_GEN-size block in COPYA into the correct
+*                    column positions.
+*
+                  IF( IMAT.EQ.6
+     $                .OR. IMAT.EQ.7
+     $                .OR. IMAT.EQ.8
+     $                .OR. IMAT.EQ.10
+     $                .OR. IMAT.EQ.11 ) THEN
+*
+*                    Move by swapping the generated columns
+*                    (NB_ZERO+1:NB_ZERO+JB_ZERO-1) of the right
+*                    NB_GEN-size block into columns (1:JB_ZERO-1).
+*
+                     DO J = 1, JB_ZERO-1, 1
+                        CALL SSWAP( M,
+     $                       COPYA( ( NB_ZERO+J-1)*LDA+1), 1,
+     $                       COPYA( (J-1)*LDA + 1 ), 1 )
+                     END DO
+*
+                  ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN
+*
+*                    ( IMAT = 12, Odd-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the even zero columns in
+*                    the left NB_ZERO-size block.
+*
+*                    ( IMAT = 13, Even-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the odd zero columns in
+*                    the left NB_ZERO-size block.
+*
+                     DO J = 1, NB_GEN, 1
+                        IND_OUT = ( NB_ZERO+J-1 )*LDA + 1
+                        IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA
+     $                           + 1
+                        CALL SSWAP( M,
+     $                       COPYA( IND_OUT ), 1,
+     $                       COPYA( IND_IN), 1 )
+                     END DO
+*
+                  END IF
+*
+*                 4) Order the singular values generated by
+*                    SLATMS in decreasing order and add trailing zeros
+*                    that correspond to zero columns.
+*                    The total number of singular values is MINMN.
+*
+                  MINMNB_GEN = MIN( M, NB_GEN )
+*
+                  DO I = MINMNB_GEN+1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE
+*
+*                 IF(MINMN.LT.2) skip this size for this matrix type.
+*
+                  CYCLE
+               END IF
+*
+*              Initialize a copy array for a pivot array for SGEQP3RK.
+*
+               DO I = 1, N
+                  IWORK( I ) = 0
+               END DO
+*
+               DO INB = 1, NNB
+*
+*                 Do for each pair of values (NB,NX) in NBVAL and NXVAL.
+*
+                  NB = NBVAL( INB )
+                  CALL XLAENV( 1, NB )
+                  NX = NXVAL( INB )
+                  CALL XLAENV( 3, NX )
+*
+*                 We use MIN(M,N)+1 as the loop bound because we need
+*                 a test with KMAX larger than MIN(M,N); in that case
+*                 the factorization should proceed as if
+*                 KMAX = MIN(M,N).
+*
+                  DO KMAX = 0, MIN(M,N)+1
+*
+*                    Get a working copy of COPYA into A( 1:M,1:N ).
+*                    Get a working copy of COPYB into
+*                    A( 1:M, (N+1):(N+NRHS) ).
+*                    Get a working copy of COPYB into B( 1:M, 1:NRHS ).
+*                    Get a working copy of the zeroed IWORK(1:N) into
+*                    IWORK( N+1:2N ), which is going to be used as
+*                    the pivot array.
+*                    NOTE: IWORK(2N+1:3N) is going to be used as a WORK
+*                    array for the routine.
+*
+                     CALL SLACPY( 'All', M, N, COPYA, LDA, A, LDA )
+                     CALL SLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                            A( LDA*N + 1 ), LDA )
+                     CALL SLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                            B, LDA )
+                     CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 )
+*
+                     ABSTOL = -1.0
+                     RELTOL = -1.0
+*
+*                    Compute the QR factorization with pivoting of A
+*
+                     LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ),
+     $                                 3*N + NRHS - 1 ) )
+*
+*                    Compute SGEQP3RK factorization of A.
+*
+                     SRNAMT = 'SGEQP3RK'
+                     CALL SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                              A, LDA, KFACT, MAXC2NRMK,
+     $                              RELMAXC2NRMK, IWORK( N+1 ), TAU,
+     $                              WORK, LW, IWORK( 2*N+1 ), INFO )
+*
+*                    Check error code from SGEQP3RK.
+*
+                     IF( INFO.LT.0 )
+     $                  CALL ALAERH( PATH, 'SGEQP3RK', INFO, 0, ' ',
+     $                               M, N, NX, -1, NB, IMAT,
+     $                               NFAIL, NERRS, NOUT )
+*
+*                    Compute test 1:
+*
+*                    This test is only for the full-rank factorization
+*                    of the matrix A.
+*
+*                    Array S(1:min(M,N)) contains svd(A), the singular
+*                    values of the original matrix A in decreasing
+*                    absolute value order. The test computes svd(R),
+*                    the vector of singular values of the upper
+*                    trapezoid of A(1:M,1:N) that contains the factor
+*                    R, in decreasing order. The test returns
+*                    the ratio:
+*
+*                    2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS )
+*
+                     IF( KFACT.EQ.MINMN ) THEN
+*
+                        RESULT( 1 ) = SQRT12( M, N, A, LDA, S, WORK,
+     $                                LWORK )
+*
+                        DO T = 1, 1
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK',
+     $                            M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T, RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                    End test 1
+*
+                     END IF
+*
+*                    Compute test 2:
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS )
+*
+                     RESULT( 2 ) = SQPT01( M, N, KFACT, COPYA, A, LDA,
+     $                             TAU, IWORK( N+1 ), WORK, LWORK )
+*
+*                    Compute test 3:
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( Q**T * Q - I ) / ( M * EPS )
+*
+                     RESULT( 3 ) = SQRT11( M, KFACT, A, LDA, TAU, WORK,
+     $                             LWORK )
+*
+*                    Print information about the tests that did not
+*                    pass the threshold.
+*
+                     DO T = 2, 3
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK', M, N,
+     $                         NRHS, KMAX, ABSTOL, RELTOL,
+     $                         NB, NX, IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 2
+*
+*                    Compute test 4:
+*
+*                    This test is only for factorizations with rank
+*                    at least 2.
+*                    The elements on the diagonal of R should be
+*                    non-increasing.
+*
+*                    The test returns the ratio:
+*
+*                    Returns BIGNUM if abs(R(K+1,K+1)) > abs(R(K,K)),
+*                    K=1:KFACT-1
+*
+                     IF( MIN(KFACT, MINMN).GE.2 ) THEN
+*
+*                       Initialize the ratio before scanning
+*                       the diagonal.
+*
+                        RESULT( 4 ) = ZERO
+*
+                        DO J = 1, KFACT-1, 1
+
+                           DTEMP = (( ABS( A( (J-1)*M+J ) ) -
+     $                        ABS( A( (J)*M+J+1 ) ) ) /
+     $                        ABS( A(1) ) )
+*
+                           IF( DTEMP.LT.ZERO ) THEN
+                              RESULT( 4 ) = BIGNUM
+                           END IF
+*
+                        END DO
+*
+*                       Print information about the tests that did not
+*                       pass the threshold.
+*
+                        DO T = 4, 4
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK',
+     $                            M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T,
+     $                            RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                    End test 4.
+*
+                     END IF
+*
+*                    Compute test 5:
+*
+*                    This test is only for matrix A with min(M,N) > 0.
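+*                    It checks that the Q**T * B block returned by
+*                    SGEQP3RK in A( 1:M, N+1:N+NRHS ) agrees with
+*                    Q**T * B computed independently by SORMQR from
+*                    the stored reflectors.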
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( (Q**T * B returned by SGEQP3RK) -
+*                            (Q**T * B computed by SORMQR) ) /
+*                    ( M * EPS )
+*
+*                    (1) Compute B:=Q**T * B in the matrix B.
+*
+                     IF( MINMN.GT.0 ) THEN
+*
+                        LWORK_MQR = MAX(1, NRHS)
+                        CALL SORMQR( 'Left', 'Transpose',
+     $                       M, NRHS, KFACT, A, LDA, TAU, B, LDA,
+     $                       WORK, LWORK_MQR, INFO )
+*
+                        DO I = 1, NRHS
+*
+*                          Subtract the (N+I)-th column of A from the
+*                          I-th column of B.
+*
+                           CALL SAXPY( M, -ONE, A( ( N+I-1 )*LDA+1 ),
+     $                          1, B( ( I-1 )*LDA+1 ), 1 )
+                        END DO
+*
+                        RESULT( 5 ) =
+     $                    ABS(
+     $                    SLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY )
+     $                    / ( REAL( M )*SLAMCH( 'Epsilon' ) )
+     $                    )
+*
+*                       Print information about the tests that did not
+*                       pass the threshold.
+*
+                        DO T = 5, 5
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK',
+     $                            M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T, RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                    End compute test 5.
+*
+                     END IF
+*
+*                 END DO KMAX = 0, MIN(M,N)+1
+*
+                  END DO
+*
+*              END DO for INB = 1, NNB
+*
+               END DO
+*
+*           END DO for IMAT = 1, NTYPES
+*
+            END DO
+*
+*        END DO for INS = 1, NNS
+*
+         END DO
+*
+*     END DO for IN = 1, NN
+*
+      END DO
+*
+*     END DO for IM = 1, NM
+*
+      END DO
+*
+*     Print a summary of the results.
+*
+      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
+*
+ 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5,
+     $      ', KMAX =', I5, ', ABSTOL =', G12.5,
+     $      ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4,
+     $      ', type ', I2, ', test ', I2, ', ratio =', G12.5 )
+*
+*     End of SCHKQP3RK
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f
index d3c27ae56..6490cd7c3 100644
--- a/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f
@@ -423,9 +423,9 @@
 *              block factorization, LWORK is the length of AINV.
 *
                SRNAMT = 'SSYTRF_AA_2STAGE'
-               LWORK = MIN(N*NB, 3*NMAX*NMAX)
+               LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
               CALL SSYTRF_AA_2STAGE( UPLO, N, AFAC, LDA,
-     $                                AINV, (3*NB+1)*N,
+     $                                AINV, MAX( 1, (3*NB+1)*N ),
      $                                IWORK, IWORK( 1+N ),
      $                                WORK, LWORK,
      $                                INFO )
@@ -505,7 +505,6 @@
                CALL SLACPY( 'Full', N, NRHS, B, LDA, X, LDA )
 *
                SRNAMT = 'SSYTRS_AA_2STAGE'
-               LWORK = MAX( 1, 3*N-2 )
                CALL SSYTRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
      $              AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ),
      $              X, LDA, INFO )
diff --git a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f
index aff32bce9..319b90805 100644
--- a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f
@@ -400,9 +400,9 @@
 *           Factor the matrix and solve the system using SSYSV_AA.
 *
             SRNAMT = 'SSYSV_AA_2STAGE '
-            LWORK = MIN(N*NB, 3*NMAX*NMAX)
+            LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
             CALL SSYSV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
-     $                            AINV, (3*NB+1)*N,
+     $                            AINV, MAX( 1, (3*NB+1)*N ),
      $                            IWORK, IWORK( 1+N ),
      $                            X, LDA, WORK, LWORK, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/slatb4.f b/lapack-netlib/TESTING/LIN/slatb4.f
index 94d29db40..72a310727 100644
--- a/lapack-netlib/TESTING/LIN/slatb4.f
+++ b/lapack-netlib/TESTING/LIN/slatb4.f
@@ -153,9 +153,6 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           SLABAD
-*     ..
 *     .. Save statement ..
       SAVE               EPS, SMALL, LARGE, BADC1, BADC2, FIRST
 *     ..
@@ -173,11 +170,6 @@
          BADC1 = SQRT( BADC2 )
          SMALL = SLAMCH( 'Safe minimum' )
          LARGE = ONE / SMALL
-*
-*        If it looks like we're on a Cray, take the square root of
-*        SMALL and LARGE to avoid overflow and underflow problems.
-*
-         CALL SLABAD( SMALL, LARGE )
          SMALL = SHRINK*( SMALL / EPS )
          LARGE = ONE / SMALL
       END IF
@@ -232,6 +224,110 @@
       ELSE
          ANORM = ONE
       END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        xQK: truncated QR with pivoting.
+*             Set parameters to generate a general
+*             M x N matrix.
+*
+*        Set TYPE, the type of matrix to be generated. 'N' is nonsymmetric.
+*
+         TYPE = 'N'
+*
+*        Set DIST, the type of distribution for the random
+*        number generator. 'S' is symmetric (uniform on (-1,1)).
+*
+         DIST = 'S'
+*
+*        Set the lower and upper bandwidths.
+*
+         IF( IMAT.EQ.2 ) THEN
+*
+*           2. Random, Diagonal, CNDNUM = 2
+*
+            KL = 0
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.3 ) THEN
+*
+*           3. Random, Upper triangular, CNDNUM = 2
+*
+            KL = 0
+            KU = MAX( N-1, 0 )
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.4 ) THEN
+*
+*           4. Random, Lower triangular, CNDNUM = 2
+*
+            KL = MAX( M-1, 0 )
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE
+*
+*           5.-19. Rectangular matrix
+*
+            KL = MAX( M-1, 0 )
+            KU = MAX( N-1, 0 )
+*
+            IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN
+*
+*              5.-14. Random, CNDNUM = 2.
+*
+               CNDNUM = TWO
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.15 ) THEN
+*
+*              15. Random, CNDNUM = sqrt(0.1/EPS)
+*
+               CNDNUM = BADC1
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.16 ) THEN
+*
+*              16. Random, CNDNUM = 0.1/EPS
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.17 ) THEN
+*
+*              17. Random, CNDNUM = 0.1/EPS,
+*                  one small singular value S(N)=1/CNDNUM
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 2
+*
+            ELSE IF( IMAT.EQ.18 ) THEN
+*
+*              18. Random, scaled near underflow
+*
+               CNDNUM = TWO
+               ANORM = SMALL
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.19 ) THEN
+*
+*              19. Random, scaled near overflow
+*
+               CNDNUM = TWO
+               ANORM = LARGE
+               MODE = 3
+*
+            END IF
+*
+         END IF
 *
@@ -518,17 +614,18 @@
 *
 *        Set the norm and condition number.
 *
-         IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN
+         MAT = ABS( IMAT )
+         IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN
             CNDNUM = BADC1
-         ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN
+         ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN
             CNDNUM = BADC2
          ELSE
            CNDNUM = TWO
         END IF
 *
-         IF( IMAT.EQ.4 ) THEN
+         IF( MAT.EQ.4 ) THEN
             ANORM = SMALL
-         ELSE IF( IMAT.EQ.5 ) THEN
+         ELSE IF( MAT.EQ.5 ) THEN
             ANORM = LARGE
          ELSE
             ANORM = ONE
diff --git a/lapack-netlib/TESTING/LIN/sqpt01.f b/lapack-netlib/TESTING/LIN/sqpt01.f
index de0c80e53..f53686a65 100644
--- a/lapack-netlib/TESTING/LIN/sqpt01.f
+++ b/lapack-netlib/TESTING/LIN/sqpt01.f
@@ -33,7 +33,8 @@
 *> Householder vectors, and the rest of AF contains a partially updated
 *> matrix.
 *>
-*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M)
+*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) )
+*> where || . || is matrix one norm.
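+*> Q is applied implicitly through the K Householder reflectors
+*> stored below the diagonal of AF(1:M,1:K), so the residual is
+*> formed without building Q explicitly.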
 *> \endverbatim
 *
*  Arguments:
@@ -172,28 +173,28 @@
 *
       NORMA = SLANGE( 'One-norm', M, N, A, LDA, RWORK )
 *
-      DO 30 J = 1, K
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, K
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = AF( I, J )
-   10    CONTINUE
-         DO 20 I = J + 1, M
+         END DO
+         DO I = J + 1, M
             WORK( ( J-1 )*M+I ) = ZERO
-   20    CONTINUE
-   30 CONTINUE
-      DO 40 J = K + 1, N
+         END DO
+      END DO
+      DO J = K + 1, N
          CALL SCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 )
-   40 CONTINUE
+      END DO
 *
       CALL SORMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK,
      $             M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
-      DO 50 J = 1, N
+      DO J = 1, N
 *
-*        Compare i-th column of QR and jpvt(i)-th column of A
+*        Compare J-th column of QR and JPVT(J)-th column of A.
 *
          CALL SAXPY( M, -ONE, A( 1, JPVT( J ) ), 1, WORK( ( J-1 )*M+1 ),
      $               1 )
-   50 CONTINUE
+      END DO
 *
       SQPT01 = SLANGE( 'One-norm', M, N, WORK, M, RWORK ) /
      $         ( REAL( MAX( M, N ) )*SLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/sqrt11.f b/lapack-netlib/TESTING/LIN/sqrt11.f
index d4422dacb..a3753adcf 100644
--- a/lapack-netlib/TESTING/LIN/sqrt11.f
+++ b/lapack-netlib/TESTING/LIN/sqrt11.f
@@ -157,9 +157,9 @@
       CALL SORM2R( 'Left', 'Transpose', M, M, K, A, LDA, TAU, WORK, M,
      $             WORK( M*M+1 ), INFO )
 *
-      DO 10 J = 1, M
+      DO J = 1, M
          WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE
-   10 CONTINUE
+      END DO
 *
       SQRT11 = SLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) /
      $         ( REAL( M )*SLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/sqrt12.f b/lapack-netlib/TESTING/LIN/sqrt12.f
index 2eab0ee0d..46b359e07 100644
--- a/lapack-netlib/TESTING/LIN/sqrt12.f
+++ b/lapack-netlib/TESTING/LIN/sqrt12.f
@@ -26,7 +26,7 @@
 *> SQRT12 computes the singular values `svlues' of the upper trapezoid
 *> of A(1:M,1:N) and returns the ratio
 *>
-*>      || s - svlues||/(||svlues||*eps*max(M,N))
+*>      || svlues - s ||/(||s||*eps*max(M,N))
 *> \endverbatim
 *
*  Arguments:
@@ -113,8 +113,7 @@
       EXTERNAL           SASUM, SLAMCH, SLANGE, SNRM2
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SAXPY, SBDSQR, SGEBD2, SLABAD, SLASCL, SLASET,
-     $                   XERBLA
+      EXTERNAL           SAXPY, SBDSQR, SGEBD2, SLASCL, SLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN, REAL
@@ -145,17 +144,16 @@
 *     Copy upper triangle of A into work
 *
       CALL SLASET( 'Full', M, N, ZERO, ZERO, WORK, M )
-      DO 20 J = 1, N
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, N
+         DO I = 1, MIN( J, M )
            WORK( ( J-1 )*M+I ) = A( I, J )
-   10    CONTINUE
-   20 CONTINUE
+         END DO
+      END DO
 *
 *     Get machine parameters
 *
       SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale work if max entry outside range [SMLNUM,BIGNUM]
 *
@@ -199,9 +197,9 @@
 *
       ELSE
 *
-         DO 30 I = 1, MN
+         DO I = 1, MN
             WORK( M*N+I ) = ZERO
-   30    CONTINUE
+         END DO
       END IF
 *
 *     Compare s and singular values of work
diff --git a/lapack-netlib/TESTING/LIN/zchkaa.F b/lapack-netlib/TESTING/LIN/zchkaa.F
index a118515a5..f1020f2d8 100644
--- a/lapack-netlib/TESTING/LIN/zchkaa.F
+++ b/lapack-netlib/TESTING/LIN/zchkaa.F
@@ -69,6 +69,7 @@
 *> ZLQ    8    List types on next line if 0 < NTYPES <  8
 *> ZQL    8    List types on next line if 0 < NTYPES <  8
 *> ZQP    6    List types on next line if 0 < NTYPES <  6
+*> ZQK   19    List types on next line if 0 < NTYPES < 19
 *> ZTZ    3    List types on next line if 0 < NTYPES <  3
 *> ZLS    6    List types on next line if 0 < NTYPES <  6
 *> ZEQ
@@ -153,12 +154,11 @@
      $                   NBVAL( MAXIN ), NBVAL2( MAXIN ),
      $                   NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ),
      $                   RANKVAL( MAXIN ), PIV( NMAX )
-      DOUBLE PRECISION   S( 2*NMAX )
-      COMPLEX*16         E( NMAX )
-*
-*     .. Allocatable Arrays ..
+*     ..
+*     .. Allocatable Arrays ..
INTEGER AllocateStatus - DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE:: RWORK + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE:: RWORK, S + COMPLEX*16, DIMENSION(:), ALLOCATABLE :: E COMPLEX*16, DIMENSION(:,:), ALLOCATABLE:: A, B, WORK * .. * .. External Functions .. @@ -170,15 +170,16 @@ EXTERNAL ALAREQ, ZCHKEQ, ZCHKGB, ZCHKGE, ZCHKGT, ZCHKHE, $ ZCHKHE_ROOK, ZCHKHE_RK, ZCHKHE_AA, ZCHKHP, $ ZCHKLQ, ZCHKUNHR_COL, ZCHKPB, ZCHKPO, ZCHKPS, - $ ZCHKPP, ZCHKPT, ZCHKQ3, ZCHKQL, ZCHKQR, ZCHKRQ, - $ ZCHKSP, ZCHKSY, ZCHKSY_ROOK, ZCHKSY_RK, - $ ZCHKSY_AA, ZCHKTB, ZCHKTP, ZCHKTR, ZCHKTZ, - $ ZDRVGB, ZDRVGE, ZDRVGT, ZDRVHE, ZDRVHE_ROOK, - $ ZDRVHE_RK, ZDRVHE_AA, ZDRVHE_AA_2STAGE, ZDRVHP, - $ ZDRVLS, ZDRVPB, ZDRVPO, ZDRVPP, ZDRVPT, - $ ZDRVSP, ZDRVSY, ZDRVSY_ROOK, ZDRVSY_RK, - $ ZDRVSY_AA, ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT, - $ ZCHKQRTP, ZCHKLQT, ZCHKLQTP, ZCHKTSQR + $ ZCHKPP, ZCHKPT, ZCHKQ3, ZCHKQP3RK, ZCHKQL, + $ ZCHKQR, ZCHKRQ, ZCHKSP, ZCHKSY, ZCHKSY_ROOK, + $ ZCHKSY_RK, ZCHKSY_AA, ZCHKTB, ZCHKTP, ZCHKTR, + $ ZCHKTZ, ZDRVGB, ZDRVGE, ZDRVGT, ZDRVHE, + $ ZDRVHE_ROOK, ZDRVHE_RK, ZDRVHE_AA, + $ ZDRVHE_AA_2STAGE, ZDRVHP, ZDRVLS, ZDRVPB, + $ ZDRVPO, ZDRVPP, ZDRVPT, ZDRVSP, ZDRVSY, + $ ZDRVSY_ROOK, ZDRVSY_RK, ZDRVSY_AA, + $ ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT, ZCHKQRTP, + $ ZCHKLQT, ZCHKLQTP, ZCHKTSQR * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -197,13 +198,18 @@ DATA THREQ / 2.0D0 / , INTSTR / '0123456789' / * * .. Allocate memory dynamically .. - ALLOCATE (RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus) +* + ALLOCATE ( A ( (KDMAX+1) * NMAX, 7 ), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B ( NMAX * MAXRHS, 4 ), STAT = AllocateStatus) + IF (AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( WORK ( NMAX, NMAX+MAXRHS+10 ), STAT = AllocateStatus) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (A ((KDMAX+1) * NMAX, 7), STAT = AllocateStatus) + ALLOCATE ( E( NMAX ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (B (NMAX * MAXRHS, 4), STAT = AllocateStatus) + ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (WORK (NMAX, NMAX+MAXRHS+10), STAT = AllocateStatus) + ALLOCATE ( RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. * .. Executable Statements .. 
@@ -1109,6 +1115,23 @@
          ELSE
             WRITE( NOUT, FMT = 9989 )PATH
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        QK: truncated QR factorization with pivoting
+*
+         NTYPES = 19
+         CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
+*
+         IF( TSTCHK ) THEN
+            CALL ZCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+     $                      NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ),
+     $                      A( 1, 2 ), B( 1, 1 ), B( 1, 2 ),
+     $                      S( 1 ), B( 1, 4 ),
+     $                      WORK, RWORK, IWORK, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 )PATH
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN
 *
@@ -1245,6 +1268,8 @@
 *
       DEALLOCATE (A, STAT = AllocateStatus)
       DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (E, STAT = AllocateStatus)
+      DEALLOCATE (S, STAT = AllocateStatus)
       DEALLOCATE (RWORK, STAT = AllocateStatus)
       DEALLOCATE (WORK, STAT = AllocateStatus)
 *
diff --git a/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f
index 381fac9f2..51082f1d0 100644
--- a/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f
@@ -8,7 +8,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB, NBVAL, 
+*       SUBROUTINE ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB, NBVAL,
 *                                    NNS, NSVAL, THRESH, TSTERR, NMAX, A,
 *                                    AFAC, AINV, B, X, XACT, WORK, RWORK,
 *                                    IWORK, NOUT )
@@ -185,7 +185,8 @@
       LOGICAL            DOTYPE( * )
       INTEGER            IWORK( * ), NBVAL( * ), NSVAL( * ), NVAL( * )
       COMPLEX*16         A( * ), AFAC( * ), AINV( * ), B( * ),
-     $                   RWORK( * ), WORK( * ), X( * ), XACT( * )
+     $                   WORK( * ), X( * ), XACT( * )
+      DOUBLE PRECISION   RWORK( * )
 *     ..
 *
 *  =====================================================================
@@ -430,9 +431,9 @@
 *              block factorization, LWORK is the length of AINV.
 *
                SRNAMT = 'ZHETRF_AA_2STAGE'
-               LWORK = MIN(N*NB, 3*NMAX*NMAX)
-               CALL ZHETRF_AA_2STAGE( UPLO, N, AFAC, LDA,
-     $                                AINV, (3*NB+1)*N,
+               LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
+               CALL ZHETRF_AA_2STAGE( UPLO, N, AFAC, LDA,
+     $                                AINV, MAX( 1, (3*NB+1)*N ),
      $                                IWORK, IWORK( 1+N ),
      $                                WORK, LWORK,
      $                                INFO )
diff --git a/lapack-netlib/TESTING/LIN/zchkqp3rk.f b/lapack-netlib/TESTING/LIN/zchkqp3rk.f
new file mode 100644
index 000000000..302c7b1a8
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/zchkqp3rk.f
@@ -0,0 +1,836 @@
+*> \brief \b ZCHKQP3RK
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+*      $                      NNB, NBVAL, NXVAL, THRESH, A, COPYA,
+*      $                      B, COPYB, S, TAU,
+*      $                      WORK, RWORK, IWORK, NOUT )
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       INTEGER            NM, NN, NNS, NNB, NOUT
+*       DOUBLE PRECISION   THRESH
+*       ..
+*       .. Array Arguments ..
+*       LOGICAL            DOTYPE( * )
+*       INTEGER            IWORK( * ), MVAL( * ), NBVAL( * ),
+*      $                   NSVAL( * ), NVAL( * ), NXVAL( * )
+*       DOUBLE PRECISION   S( * ), RWORK( * )
+*       COMPLEX*16         A( * ), COPYA( * ), B( * ), COPYB( * ),
+*      $                   TAU( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZCHKQP3RK tests ZGEQP3RK.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] DOTYPE
+*> \verbatim
+*>          DOTYPE is LOGICAL array, dimension (NTYPES)
+*>          The matrix types to be used for testing. Matrices of type j
+*>          (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) =
+*>          .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used.
+*> \endverbatim
+*>
+*> \param[in] NM
+*> \verbatim
+*>          NM is INTEGER
+*>          The number of values of M contained in the vector MVAL.
+*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. +*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNS +*> \verbatim +*> NNS is INTEGER +*> The number of values of NRHS contained in the vector NSVAL. +*> \endverbatim +*> +*> \param[in] NSVAL +*> \verbatim +*> NSVAL is INTEGER array, dimension (NNS) +*> The values of the number of right hand sides NRHS. +*> \endverbatim +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB and NX contained in the +*> vectors NBVAL and NXVAL. The blocking parameters are used +*> in pairs (NB,NX). +*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NNB) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NXVAL +*> \verbatim +*> NXVAL is INTEGER array, dimension (NNB) +*> The values of the crossover point NX. +*> \endverbatim +*> +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. +*> \endverbatim +*> +*> \param[out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (MMAX*NMAX) +*> where MMAX is the maximum value of M in MVAL and NMAX is the +*> maximum value of N in NVAL. +*> \endverbatim +*> +*> \param[out] COPYA +*> \verbatim +*> COPYA is COMPLEX*16 array, dimension (MMAX*NMAX) +*> \endverbatim +*> +*> \param[out] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (MMAX*NSMAX) +*> where MMAX is the maximum value of M in MVAL and NSMAX is the +*> maximum value of NRHS in NSVAL. +*> \endverbatim +*> +*> \param[out] COPYB +*> \verbatim +*> COPYB is COMPLEX*16 array, dimension (MMAX*NSMAX) +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is DOUBLE PRECISION array, dimension +*> (min(MMAX,NMAX)) +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX*16 array, dimension (MMAX) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension +*> (max(M*max(M,N) + 4*min(M,N) + max(M,N))) +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (4*NMAX) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (2*NMAX) +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16_lin +* +* ===================================================================== + SUBROUTINE ZCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, + $ B, COPYB, S, TAU, + $ WORK, RWORK, IWORK, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER NM, NN, NNB, NNS, NOUT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. 
+ LOGICAL DOTYPE( * ) + INTEGER IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ), + $ NSVAL( * ), NXVAL( * ) + DOUBLE PRECISION S( * ), RWORK( * ) + COMPLEX*16 A( * ), COPYA( * ), B( * ), COPYB( * ), + $ TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NTYPES + PARAMETER ( NTYPES = 19 ) + INTEGER NTESTS + PARAMETER ( NTESTS = 5 ) + DOUBLE PRECISION ONE, ZERO, BIGNUM + COMPLEX*16 CONE, CZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0, + $ CZERO = ( 0.0D+0, 0.0D+0 ), + $ CONE = ( 1.0D+0, 0.0D+0 ), + $ BIGNUM = 1.0D+38 ) +* .. +* .. Local Scalars .. + CHARACTER DIST, TYPE + CHARACTER*3 PATH + INTEGER I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO, + $ INB, IND_OFFSET_GEN, + $ IND_IN, IND_OUT, INS, INFO, + $ ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO, + $ KFACT, KL, KMAX, KU, LDA, LW, LWORK, + $ LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N, + $ NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS, + $ NRUN, NX, T + DOUBLE PRECISION ANORM, CNDNUM, EPS, ABSTOL, RELTOL, + $ DTEMP, MAXC2NRMK, RELMAXC2NRMK +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ), ISEEDY( 4 ) + DOUBLE PRECISION RESULT( NTESTS ), RDUMMY( 1 ) +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, ZQPT01, ZQRT11, ZQRT12, ZLANGE + EXTERNAL DLAMCH, ZQPT01, ZQRT11, ZQRT12, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL ALAERH, ALAHD, ALASUM, DLAORD, ICOPY, ZAXPY, + $ XLAENV, ZGEQP3RK, ZLACPY, ZLASET, ZLATB4, + $ ZLATMS, ZUNMQR, ZSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX, MIN, MOD +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, IOUNIT, ZUNMQR_LWORK +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, IOUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEEDY / 1988, 1989, 1990, 1991 / +* .. +* .. Executable Statements .. +* +* Initialize constants and the random number seed. +* + PATH( 1: 1 ) = 'Zomplex precision' + PATH( 2: 3 ) = 'QK' + NRUN = 0 + NFAIL = 0 + NERRS = 0 + DO I = 1, 4 + ISEED( I ) = ISEEDY( I ) + END DO + EPS = DLAMCH( 'Epsilon' ) + INFOT = 0 +* + DO IM = 1, NM +* +* Do for each value of M in MVAL. +* + M = MVAL( IM ) + LDA = MAX( 1, M ) +* + DO IN = 1, NN +* +* Do for each value of N in NVAL. +* + N = NVAL( IN ) + MINMN = MIN( M, N ) + LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ), + $ M*N + 2*MINMN + 4*N ) +* + DO INS = 1, NNS + NRHS = NSVAL( INS ) +* +* Set up parameters with ZLATB4 and generate +* M-by-NRHS B matrix with ZLATMS. +* IMAT = 14: +* Random matrix, CNDNUM = 2, NORM = ONE, +* MODE = 3 (geometric distribution of singular values). +* + CALL ZLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'ZLATMS' + CALL ZLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYB, LDA, WORK, INFO ) +* +* Check error code from ZLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'ZLATMS', INFO, 0, ' ', M, + $ NRHS, -1, -1, -1, 6, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + DO IMAT = 1, NTYPES +* +* Do the tests only if DOTYPE( IMAT ) is true. +* + IF( .NOT.DOTYPE( IMAT ) ) + $ CYCLE +* +* The type of distribution used to generate the random +* eigen-/singular values: +* ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 ) +* +* Do for each type of NON-SYMMETRIC matrix: CNDNUM NORM MODE +* 1. Zero matrix +* 2. Random, Diagonal, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 3. 
Random, Upper triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*        4. Random, Lower triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*        5. Random, First column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*        6. Random, Last MINMN column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*        7. Random, Last N column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*        8. Random, Middle column in MINMN is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*        9. Random, First half of MINMN columns are zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*       10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*       11. Random, Half MINMN columns in the middle are zero starting
+*           from MINMN/2-(MINMN/2)/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*       12. Random, Odd columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*       13. Random, Even columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*       14. Random, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*       15. Random, CNDNUM = sqrt(0.1/EPS) CNDNUM = BADC1 = sqrt(0.1/EPS) ONE 3 ( geometric distribution of singular values )
+*       16. Random, CNDNUM = 0.1/EPS CNDNUM = BADC2 = 0.1/EPS ONE 3 ( geometric distribution of singular values )
+*       17. Random, CNDNUM = 0.1/EPS, CNDNUM = BADC2 = 0.1/EPS ONE 2 ( one small singular value, S(N)=1/CNDNUM )
+*           one small singular value S(N)=1/CNDNUM
+*       18. Random, CNDNUM = 2, scaled near underflow CNDNUM = 2 SMALL = SAFMIN
+*       19. Random, CNDNUM = 2, scaled near overflow CNDNUM = 2 LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) ) 3 ( geometric distribution of singular values )
+*
+               IF( IMAT.EQ.1 ) THEN
+*
+*                 Matrix 1: Zero matrix
+*
+                  CALL ZLASET( 'Full', M, N, CZERO, CZERO, COPYA, LDA )
+                  DO I = 1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 )
+     $                  .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN
+*
+*                 Matrices 2-4 and 14-19.
+*
+*                 Set up parameters with ZLATB4 and generate a test
+*                 matrix with ZLATMS.
+*
+                  CALL ZLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'ZLATMS'
+                  CALL ZLATMS( M, N, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA, LDA, WORK, INFO )
+*
+*                 Check error code from ZLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'ZLATMS', INFO, 0, ' ', M, N,
+     $                            -1, -1, -1, IMAT, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+                  CALL DLAORD( 'Decreasing', MINMN, S, 1 )
+*
+               ELSE IF( MINMN.GE.2
+     $                  .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN
+*
+*                 Rectangular matrices 5-13 that contain zero columns,
+*                 only for matrices with MINMN >= 2.
+*
+*                 JB_ZERO    is the column index of the ZERO block.
+*                 NB_ZERO    is the column block size of the ZERO block.
+*                 NB_GEN     is the column block size of the
+*                            generated block.
+*                 J_INC      is the non-zero column index increment
+*                            for matrices 12 and 13.
+*                 J_FIRST_NZ is the index of the first non-zero
+*                            column.
+*
+                  IF( IMAT.EQ.5 ) THEN
+*
+*                    First column is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.6 ) THEN
+*
+*                    Last column MINMN is zero.
+*
+                     JB_ZERO = MINMN
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.7 ) THEN
+*
+*                    Last column N is zero.
+*
+                     JB_ZERO = N
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.8 ) THEN
+*
+*                    Middle column in MINMN is zero.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.9 ) THEN
+*
+*                    First half of MINMN columns is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.10 ) THEN
+*
+*                    Last columns are zero columns,
+*                    starting from (MINMN / 2 + 1) column.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = N - JB_ZERO + 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.11 ) THEN
+*
+*                    Half of the columns in the middle of MINMN
+*                    columns are zero, starting from
+*                    MINMN/2 - (MINMN/2)/2 + 1 column.
+*
+                     JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.12 ) THEN
+*
+*                    Odd-numbered columns are zero.
+*
+                     NB_GEN = N / 2
+                     NB_ZERO = N - NB_GEN
+                     J_INC = 2
+                     J_FIRST_NZ = 2
+*
+                  ELSE IF( IMAT.EQ.13 ) THEN
+*
+*                    Even-numbered columns are zero.
+*
+                     NB_ZERO = N / 2
+                     NB_GEN = N - NB_ZERO
+                     J_INC = 2
+                     J_FIRST_NZ = 1
+*
+                  END IF
+*
+*
+*                 1) Set the first NB_ZERO columns in COPYA(1:M,1:N)
+*                    to zero.
+*
+                  CALL ZLASET( 'Full', M, NB_ZERO, CZERO, CZERO,
+     $                         COPYA, LDA )
+*
+*                 2) Generate an M-by-(N-NB_ZERO) matrix with the
+*                    chosen singular value distribution
+*                    in COPYA(1:M,NB_ZERO+1:N).
+*
+                  CALL ZLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU,
+     $                         ANORM, MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'ZLATMS'
+*
+                  IND_OFFSET_GEN = NB_ZERO * LDA
+*
+                  CALL ZLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA( IND_OFFSET_GEN + 1 ), LDA,
+     $                         WORK, INFO )
+*
+*                 Check error code from ZLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'ZLATMS', INFO, 0, ' ', M,
+     $                            NB_GEN, -1, -1, -1, IMAT, NFAIL,
+     $                            NERRS, NOUT )
+                     CYCLE
+                  END IF
+*
+*                 3) Swap the generated columns from the right side
+*                    NB_GEN-size block in COPYA into correct column
+*                    positions.
+*
+                  IF( IMAT.EQ.6
+     $                .OR. IMAT.EQ.7
+     $                .OR. IMAT.EQ.8
+     $                .OR. IMAT.EQ.10
+     $                .OR. IMAT.EQ.11 ) THEN
+*
+*                    Move by swapping the generated columns
+*                    from the right NB_GEN-size block from
+*                    (NB_ZERO+1:NB_ZERO+JB_ZERO)
+*                    into columns (1:JB_ZERO-1).
+*
+                     DO J = 1, JB_ZERO-1, 1
+                        CALL ZSWAP( M,
+     $                       COPYA( ( NB_ZERO+J-1)*LDA+1), 1,
+     $                       COPYA( (J-1)*LDA + 1 ), 1 )
+                     END DO
+*
+                  ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN
+*
+*                    ( IMAT = 12, Odd-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the even zero columns in the
+*                    left NB_ZERO-size block.
+*
+*                    ( IMAT = 13, Even-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the odd zero columns in the
+*                    left NB_ZERO-size block.
+*
+                     DO J = 1, NB_GEN, 1
+                        IND_OUT = ( NB_ZERO+J-1 )*LDA + 1
+                        IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA
+     $                           + 1
+                        CALL ZSWAP( M,
+     $                       COPYA( IND_OUT ), 1,
+     $                       COPYA( IND_IN), 1 )
+                     END DO
+*
+                  END IF
+*
+*                 4) Order the singular values generated by
+*                    ZLATMS in decreasing order and add trailing zeros
+*                    that correspond to zero columns.
+*                    The total number of singular values is MINMN.
+*
+                  MINMNB_GEN = MIN( M, NB_GEN )
+*
+                  CALL DLAORD( 'Decreasing', MINMNB_GEN, S, 1 )
+
+                  DO I = MINMNB_GEN+1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE
+*
+*                 IF(MINMN.LT.2) skip this size for this matrix type.
+*
+                  CYCLE
+               END IF
+*
+*              Initialize a copy array for a pivot array for ZGEQP3RK.
+*
+               DO I = 1, N
+                  IWORK( I ) = 0
+               END DO
+*
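The zero-column types 5-13 above all follow one pattern: NB_ZERO columns of COPYA are zeroed, the remaining NB_GEN columns are filled by ZLATMS and swapped into place, and the reference vector S is the generated block's singular values padded with trailing zeros. Here is a minimal sketch of the type-13 layout (even-numbered columns zero), written in Python; NumPy is an assumption here and the sketch is not part of the patch:

    # Illustrative only: layout of matrix type 13 (even-numbered columns zero).
    import numpy as np

    m, n = 8, 6
    nb_gen = n - n // 2                        # number of generated (non-zero) columns
    rng = np.random.default_rng(1988)          # seed echoes ISEEDY above

    a = np.zeros((m, n))
    a[:, 0::2] = rng.standard_normal((m, nb_gen))   # odd 1-based columns are non-zero

    # The singular values of A are those of the generated block, padded with
    # trailing zeros -- exactly what the driver stores in S.
    s_block = np.linalg.svd(a[:, 0::2], compute_uv=False)
    s = np.concatenate([s_block, np.zeros(min(m, n) - s_block.size)])
    assert np.allclose(np.linalg.svd(a, compute_uv=False), s)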
+               DO INB = 1, NNB
+*
+*                 Do for each pair of values (NB,NX) in NBVAL and NXVAL.
+*
+                  NB = NBVAL( INB )
+                  CALL XLAENV( 1, NB )
+                  NX = NXVAL( INB )
+                  CALL XLAENV( 3, NX )
+*
+*                 We loop over KMAX = 0, ..., MIN(M,N)+1 so that the
+*                 case KMAX > MIN(M,N) is also tested; in that case
+*                 the routine has to reduce KMAX to MIN(M,N).
+*
+                  DO KMAX = 0, MIN(M,N)+1
+*
+*                    Get a working copy of COPYA into A( 1:M, 1:N ).
+*                    Get a working copy of COPYB into A( 1:M, (N+1):(N+NRHS) ).
+*                    Get a working copy of COPYB into B( 1:M, 1:NRHS ).
+*                    Get a working copy of IWORK(1:N), filled with zeros,
+*                    into IWORK( N+1:2N ), which is going to be used as
+*                    the pivot array.
+*                    NOTE: IWORK(2N+1:3N) is going to be used as a WORK
+*                    array for the routine.
+*
+                     CALL ZLACPY( 'All', M, N, COPYA, LDA, A, LDA )
+                     CALL ZLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                            A( LDA*N + 1 ), LDA )
+                     CALL ZLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                            B, LDA )
+                     CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 )
+*
+                     ABSTOL = -1.0
+                     RELTOL = -1.0
+*
+*                    Compute the QR factorization with pivoting of A
+*
+                     LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ),
+     $                                 3*N + NRHS - 1 ) )
+*
+*                    Compute ZGEQP3RK factorization of A.
+*
+                     SRNAMT = 'ZGEQP3RK'
+                     CALL ZGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                              A, LDA, KFACT, MAXC2NRMK,
+     $                              RELMAXC2NRMK, IWORK( N+1 ), TAU,
+     $                              WORK, LW, RWORK, IWORK( 2*N+1 ),
+     $                              INFO )
+*
+*                    Check error code from ZGEQP3RK.
+*
+                     IF( INFO.LT.0 )
+     $                  CALL ALAERH( PATH, 'ZGEQP3RK', INFO, 0, ' ',
+     $                               M, N, NX, -1, NB, IMAT,
+     $                               NFAIL, NERRS, NOUT )
+*
+                     IF( KFACT.EQ.MINMN ) THEN
+*
+*                       Compute test 1:
+*
+*                       This test is only for the full rank
+*                       factorization of the matrix A.
+*
+*                       Array S(1:min(M,N)) contains svd(A), the singular
+*                       values of the original matrix A in decreasing
+*                       absolute value order. The test computes svd(R),
+*                       the vector of singular values of the upper
+*                       trapezoid of A(1:M,1:N) that contains the
+*                       factor R, in decreasing order. The test returns
+*                       the ratio:
+*
+*                       2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS )
+*
+                        RESULT( 1 ) = ZQRT12( M, N, A, LDA, S, WORK,
+     $                                        LWORK, RWORK )
+*
+                        DO T = 1, 1
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', M, N,
+     $                            NRHS, KMAX, ABSTOL, RELTOL, NB, NX,
+     $                            IMAT, T, RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                       End test 1
+*
+                     END IF
+
+*                    Compute test 2:
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS )
+*
+                     RESULT( 2 ) = ZQPT01( M, N, KFACT, COPYA, A, LDA, TAU,
+     $                             IWORK( N+1 ), WORK, LWORK )
+*
+*                    Compute test 3:
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( Q**H * Q - I ) / ( M * EPS )
+*
+                     RESULT( 3 ) = ZQRT11( M, KFACT, A, LDA, TAU, WORK,
+     $                                     LWORK )
+*
+*                    Print information about the tests that did not pass
+*                    the threshold.
+*
+                     DO T = 2, 3
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', M, N,
+     $                         NRHS, KMAX, ABSTOL, RELTOL,
+     $                         NB, NX, IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 2
+*
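The ratios computed by tests 1-3 above have direct dense-linear-algebra analogues that fit in a few lines. In this sketch, SciPy's pivoted QR stands in for ZGEQP3RK (the real routine additionally truncates via KMAX/ABSTOL/RELTOL); NumPy and SciPy availability is an assumption, and the snippet is not part of the patch:

    # Illustrative only: the residual ratios behind tests 1-3.
    import numpy as np
    from scipy.linalg import qr

    rng = np.random.default_rng(0)
    m, n = 10, 6
    a = rng.standard_normal((m, n)) + 1j * rng.standard_normal((m, n))
    eps = np.finfo(np.float64).eps

    q, r, jpvt = qr(a, pivoting=True)          # A[:, jpvt] = Q @ R

    # Test 1 analogue: singular values of R match those of A.
    sv_a = np.linalg.svd(a, compute_uv=False)
    sv_r = np.linalg.svd(r, compute_uv=False)
    ratio1 = np.linalg.norm(sv_r - sv_a) / (max(m, n) * np.linalg.norm(sv_a) * eps)

    # Test 2 analogue: factorization residual ||A*P - Q*R||.
    ratio2 = np.linalg.norm(a[:, jpvt] - q @ r, 1) / (max(m, n) * np.linalg.norm(a, 1) * eps)

    # Test 3 analogue: orthonormality of Q, ||Q**H * Q - I||.
    ratio3 = np.linalg.norm(q.conj().T @ q - np.eye(m), 1) / (m * eps)

    print(ratio1, ratio2, ratio3)   # all O(1), i.e. well below THRESH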
+*                    Compute test 4:
+*
+*                    This test is only for factorizations with rank
+*                    not less than 2. The elements on the diagonal
+*                    of R should be non-increasing.
+*
+*                    The test returns the ratio 1.0D+100 if
+*                    abs(R(K+1,K+1)) > abs(R(K,K)) for some
+*                    K = 1:KFACT-1, and zero otherwise.
+*
+                     IF( MIN(KFACT, MINMN).GE.2 ) THEN
+*
+*                       RESULT( 4 ) is initialized here, as it is not
+*                       assigned in the loop below when the diagonal
+*                       is non-increasing.
+*
+                        RESULT( 4 ) = ZERO
+*
+                        DO J = 1, KFACT-1, 1
+*
+                           DTEMP = (( ABS( A( (J-1)*M+J ) ) -
+     $                              ABS( A( (J)*M+J+1 ) ) ) /
+     $                              ABS( A(1) ) )
+*
+                           IF( DTEMP.LT.ZERO ) THEN
+                              RESULT( 4 ) = BIGNUM
+                           END IF
+*
+                        END DO
+*
+*                       Print information about the tests that did not
+*                       pass the threshold.
+*
+                        DO T = 4, 4
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK',
+     $                            M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T,
+     $                            RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                       End test 4.
+*
+                     END IF
+*
+*                    Compute test 5:
+*
+*                    This test is only for matrix A with min(M,N) > 0.
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( Q**H * B computed by ZUNMQR
+*                            - Q**H * B computed by ZGEQP3RK ) /
+*                       ( M * EPS )
+*
+*                    (1) Compute B:=Q**H * B in the matrix B.
+*
+                     IF( MINMN.GT.0 ) THEN
+*
+                        LWORK_MQR = MAX(1, NRHS)
+                        CALL ZUNMQR( 'Left', 'Conjugate transpose',
+     $                               M, NRHS, KFACT, A, LDA, TAU, B, LDA,
+     $                               WORK, LWORK_MQR, INFO )
+*
+                        DO I = 1, NRHS
+*
+*                          Compare the (N+I)-th column of A and the
+*                          I-th column of B.
+*
+                           CALL ZAXPY( M, -CONE, A( ( N+I-1 )*LDA+1 ), 1,
+     $                                 B( ( I-1 )*LDA+1 ), 1 )
+                        END DO
+*
+                        RESULT( 5 ) =
+     $                     ABS(
+     $                     ZLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) /
+     $                     ( DBLE( M )*DLAMCH( 'Epsilon' ) )
+     $                     )
+*
+*                       Print information about the tests that did not pass
+*                       the threshold.
+*
+                        DO T = 5, 5
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', M, N,
+     $                            NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T, RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                       End compute test 5.
+*
+                     END IF
+*
+*                 END DO for KMAX = 0, MIN(M,N)+1
+*
+                  END DO
+*
+*              END DO for INB = 1, NNB
+*
+               END DO
+*
+*           END DO for IMAT = 1, NTYPES
+*
+            END DO
+*
+*        END DO for INS = 1, NNS
+*
+         END DO
+*
+*     END DO for IN = 1, NN
+*
+      END DO
+*
+*  END DO for IM = 1, NM
+*
+      END DO
+*
+*     Print a summary of the results.
+*
+      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
+*
+ 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5,
+     $      ', KMAX =', I5, ', ABSTOL =', G12.5,
+     $      ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4,
+     $      ', type ', I2, ', test ', I2, ', ratio =', G12.5 )
+*
+*     End of ZCHKQP3RK
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f
index 9401867e0..fcd774491 100644
--- a/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f
@@ -400,9 +400,9 @@
 *           Factor the matrix and solve the system using ZHESV_AA.
 *
             SRNAMT = 'ZHESV_AA_2STAGE '
-            LWORK = MIN(N*NB, 3*NMAX*NMAX)
+            LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
             CALL ZHESV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
-     $                            AINV, (3*NB+1)*N,
+     $                            AINV, MAX( 1, (3*NB+1)*N ),
     $                            IWORK, IWORK( 1+N ),
     $                            X, LDA, WORK, LWORK, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/zlatb4.f b/lapack-netlib/TESTING/LIN/zlatb4.f
index a6977f4e9..a2b19f83d 100644
--- a/lapack-netlib/TESTING/LIN/zlatb4.f
+++ b/lapack-netlib/TESTING/LIN/zlatb4.f
@@ -154,9 +154,6 @@
 *     .. Intrinsic Functions ..
       INTRINSIC ABS, MAX, SQRT
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL DLABAD
-*     ..
 *     .. Save statement ..
       SAVE EPS, SMALL, LARGE, BADC1, BADC2, FIRST
 *     ..
@@ -174,11 +171,6 @@
          BADC1 = SQRT( BADC2 )
          SMALL = DLAMCH( 'Safe minimum' )
          LARGE = ONE / SMALL
-*
-*        If it looks like we're on a Cray, take the square root of
-*        SMALL and LARGE to avoid overflow and underflow problems.
-*
-         CALL DLABAD( SMALL, LARGE )
          SMALL = SHRINK*( SMALL / EPS )
          LARGE = ONE / SMALL
       END IF
@@ -233,6 +225,110 @@
       ELSE
          ANORM = ONE
       END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        xQK: truncated QR with pivoting.
+*             Set parameters to generate a general
+*             M x N matrix.
+*
+*        Set TYPE, the type of matrix to be generated. 'N' is nonsymmetric.
+*
+         TYPE = 'N'
+*
+*        Set DIST, the type of distribution for the random
+*        number generator. 'S' is symmetric (values uniformly
+*        distributed on (-1,1)).
+*
+         DIST = 'S'
+*
+*        Set the lower and upper bandwidths.
+*
+         IF( IMAT.EQ.2 ) THEN
+*
+*           2. Random, Diagonal, CNDNUM = 2
+*
+            KL = 0
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.3 ) THEN
+*
+*           3. Random, Upper triangular, CNDNUM = 2
+*
+            KL = 0
+            KU = MAX( N-1, 0 )
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.4 ) THEN
+*
+*           4. Random, Lower triangular, CNDNUM = 2
+*
+            KL = MAX( M-1, 0 )
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE
+*
+*           5.-19. Rectangular matrix
+*
+            KL = MAX( M-1, 0 )
+            KU = MAX( N-1, 0 )
+*
+            IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN
+*
+*              5.-14. Random, CNDNUM = 2.
+*
+               CNDNUM = TWO
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.15 ) THEN
+*
+*              15. Random, CNDNUM = sqrt(0.1/EPS)
+*
+               CNDNUM = BADC1
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.16 ) THEN
+*
+*              16. Random, CNDNUM = 0.1/EPS
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.17 ) THEN
+*
+*              17. Random, CNDNUM = 0.1/EPS,
+*                  one small singular value S(N)=1/CNDNUM
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 2
+*
+            ELSE IF( IMAT.EQ.18 ) THEN
+*
+*              18. Random, scaled near underflow
+*
+               CNDNUM = TWO
+               ANORM = SMALL
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.19 ) THEN
+*
+*              19. Random, scaled near overflow
+*
+               CNDNUM = TWO
+               ANORM = LARGE
+               MODE = 3
+*
+            END IF
+*
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN
 *
@@ -517,17 +613,18 @@
 *
 *        Set the norm and condition number.
 *
-      IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN
+      MAT = ABS( IMAT )
+      IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN
          CNDNUM = BADC1
-      ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN
+      ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN
          CNDNUM = BADC2
       ELSE
          CNDNUM = TWO
       END IF
 *
-      IF( IMAT.EQ.4 ) THEN
+      IF( MAT.EQ.4 ) THEN
          ANORM = SMALL
-      ELSE IF( IMAT.EQ.5 ) THEN
+      ELSE IF( MAT.EQ.5 ) THEN
          ANORM = LARGE
       ELSE
          ANORM = ONE
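The 'QK' branch added to zlatb4 above is essentially a lookup table from IMAT to generator parameters: bandwidths (KL, KU), condition number, norm, and singular-value mode. For reference, a compact Python rendering of that mapping; this is purely illustrative, the function name is ad hoc, and SMALL/LARGE are left as string placeholders for the machine-dependent scalars zlatb4 actually computes:

    # Illustrative only: parameter table encoded by the new 'QK' branch.
    import math

    def qk_params(imat, m, n, eps=2.0**-52):
        badc2 = 0.1 / eps                      # BADC2 = 0.1/EPS
        badc1 = math.sqrt(badc2)               # BADC1 = sqrt(0.1/EPS)
        kl, ku = max(m - 1, 0), max(n - 1, 0)  # full bandwidths (rectangular)
        if imat == 2:
            kl = ku = 0                        # diagonal
        elif imat == 3:
            kl = 0                             # upper triangular
        elif imat == 4:
            ku = 0                             # lower triangular
        cndnum, anorm, mode = 2.0, 1.0, 3
        if imat == 15:
            cndnum = badc1
        elif imat in (16, 17):
            cndnum = badc2
            if imat == 17:
                mode = 2                       # one small singular value
        elif imat == 18:
            anorm = "SMALL"                    # placeholder: scaled near underflow
        elif imat == 19:
            anorm = "LARGE"                    # placeholder: scaled near overflow
        return kl, ku, cndnum, anorm, mode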
diff --git a/lapack-netlib/TESTING/LIN/zqpt01.f b/lapack-netlib/TESTING/LIN/zqpt01.f
index 4e53f92c8..c69eb658f 100644
--- a/lapack-netlib/TESTING/LIN/zqpt01.f
+++ b/lapack-netlib/TESTING/LIN/zqpt01.f
@@ -33,7 +33,7 @@
 *> Householder vectors, and the rest of AF contains a partially updated
 *> matrix.
 *>
-*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M)
+*> This function returns ||A*P - Q*R|| / ( ||A||*eps*max(M,N) )
 *> \endverbatim
 *
 * Arguments:
@@ -172,28 +172,28 @@
 *
       NORMA = ZLANGE( 'One-norm', M, N, A, LDA, RWORK )
 *
-      DO 30 J = 1, K
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, K
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = AF( I, J )
-   10    CONTINUE
-         DO 20 I = J + 1, M
+         END DO
+         DO I = J + 1, M
             WORK( ( J-1 )*M+I ) = ZERO
-   20    CONTINUE
-   30 CONTINUE
-      DO 40 J = K + 1, N
+         END DO
+      END DO
+      DO J = K + 1, N
          CALL ZCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 )
-   40 CONTINUE
+      END DO
 *
       CALL ZUNMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK,
     $             M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
-      DO 50 J = 1, N
+      DO J = 1, N
 *
 *        Compare i-th column of QR and jpvt(i)-th column of A
 *
         CALL ZAXPY( M, DCMPLX( -ONE ), A( 1, JPVT( J ) ), 1,
     $               WORK( ( J-1 )*M+1 ), 1 )
-   50 CONTINUE
+      END DO
 *
       ZQPT01 = ZLANGE( 'One-norm', M, N, WORK, M, RWORK ) /
     $         ( DBLE( MAX( M, N ) )*DLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/zqrt11.f b/lapack-netlib/TESTING/LIN/zqrt11.f
index c3be59c36..dc4af744f 100644
--- a/lapack-netlib/TESTING/LIN/zqrt11.f
+++ b/lapack-netlib/TESTING/LIN/zqrt11.f
@@ -158,9 +158,9 @@
       CALL ZUNM2R( 'Left', 'Conjugate transpose', M, M, K, A, LDA, TAU,
     $             WORK, M, WORK( M*M+1 ), INFO )
 *
-      DO 10 J = 1, M
+      DO J = 1, M
          WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE
-   10 CONTINUE
+      END DO
 *
       ZQRT11 = ZLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) /
     $         ( DBLE( M )*DLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/zqrt12.f b/lapack-netlib/TESTING/LIN/zqrt12.f
index 0da6be157..91477b5ea 100644
--- a/lapack-netlib/TESTING/LIN/zqrt12.f
+++ b/lapack-netlib/TESTING/LIN/zqrt12.f
@@ -28,7 +28,7 @@
 *> ZQRT12 computes the singular values `svlues' of the upper trapezoid
 *> of A(1:M,1:N) and returns the ratio
 *>
-*> || s - svlues||/(||svlues||*eps*max(M,N))
+*> ||svlues - s|| / ( ||s||*eps*max(M,N) )
 *> \endverbatim
 *
 * Arguments:
@@ -125,8 +125,8 @@
       EXTERNAL DASUM, DLAMCH, DNRM2, ZLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL DAXPY, DBDSQR, DLABAD, DLASCL, XERBLA, ZGEBD2,
-     $         ZLASCL, ZLASET
+      EXTERNAL DAXPY, DBDSQR, DLASCL, XERBLA, ZGEBD2, ZLASCL,
+     $         ZLASET
 *     ..
 *     .. Intrinsic Functions ..
INTRINSIC DBLE, DCMPLX, MAX, MIN @@ -154,17 +154,16 @@ * CALL ZLASET( 'Full', M, N, DCMPLX( ZERO ), DCMPLX( ZERO ), WORK, $ M ) - DO 20 J = 1, N - DO 10 I = 1, MIN( J, M ) + DO J = 1, N + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = A( I, J ) - 10 CONTINUE - 20 CONTINUE + END DO + END DO * * Get machine parameters * SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale work if max entry outside range [SMLNUM,BIGNUM] * @@ -208,9 +207,9 @@ * ELSE * - DO 30 I = 1, MN + DO I = 1, MN RWORK( I ) = ZERO - 30 CONTINUE + END DO END IF * * Compare s and singular values of work @@ -218,6 +217,7 @@ CALL DAXPY( MN, -ONE, S, 1, RWORK( 1 ), 1 ) ZQRT12 = DASUM( MN, RWORK( 1 ), 1 ) / $ ( DLAMCH( 'Epsilon' )*DBLE( MAX( M, N ) ) ) +* IF( NRMSVL.NE.ZERO ) $ ZQRT12 = ZQRT12 / NRMSVL * diff --git a/lapack-netlib/TESTING/Makefile b/lapack-netlib/TESTING/Makefile index bdea2bfaa..3963260ac 100644 --- a/lapack-netlib/TESTING/Makefile +++ b/lapack-netlib/TESTING/Makefile @@ -61,6 +61,8 @@ SEIGTST= snep.out \ scsd.out \ slse.out +SDMDEIGTST= sdmd.out + CEIGTST= cnep.out \ csep.out \ cse2.out \ @@ -82,6 +84,8 @@ CEIGTST= cnep.out \ ccsd.out \ clse.out +CDMDEIGTST= cdmd.out + DEIGTST= dnep.out \ dsep.out \ dse2.out \ @@ -103,6 +107,8 @@ DEIGTST= dnep.out \ dcsd.out \ dlse.out +DDMDEIGTST= ddmd.out + ZEIGTST= znep.out \ zsep.out \ zse2.out \ @@ -124,6 +130,7 @@ ZEIGTST= znep.out \ zcsd.out \ zlse.out +ZDMDEIGTST= zdmd.out SLINTST= stest.out @@ -142,10 +149,10 @@ ZLINTST= ztest.out ZLINTSTPROTO= zctest.out ztest_rfp.out .PHONY: single complex double complex16 -single: $(SLINTST) $(SEIGTST) -complex: $(CLINTST) $(CEIGTST) -double: $(DLINTST) $(DEIGTST) -complex16: $(ZLINTST) $(ZEIGTST) +single: $(SLINTST) $(SEIGTST) $(SDMDEIGTST) +complex: $(CLINTST) $(CEIGTST) $(CDMDEIGTST) +double: $(DLINTST) $(DEIGTST) $(DDMDEIGTST) +complex16: $(ZLINTST) $(ZEIGTST) $(ZDMDEIGTST) .PHONY: singleproto complexproto doubleproto complex16proto singleproto: $(SLINTSTPROTO) @@ -297,6 +304,10 @@ scsd.out: csd.in EIG/xeigtsts slse.out: lse.in EIG/xeigtsts @echo LSE: Testing Constrained Linear Least Squares routines ./EIG/xeigtsts < lse.in > $@ 2>&1 + +sdmd.out: sdmd.in EIG/xdmdeigtsts + @echo DMD: Testing Dynamic Mode Decomposition routines + ./EIG/xdmdeigtsts < sdmd.in > $@ 2>&1 # # ======== COMPLEX EIG TESTS =========================== @@ -379,6 +390,10 @@ ccsd.out: csd.in EIG/xeigtstc clse.out: lse.in EIG/xeigtstc @echo LSE: Testing Constrained Linear Least Squares routines ./EIG/xeigtstc < lse.in > $@ 2>&1 + +cdmd.out: cdmd.in EIG/xdmdeigtstc + @echo DMD: Testing Dynamic Mode Decomposition routines + ./EIG/xdmdeigtstc < cdmd.in > $@ 2>&1 # # ======== DOUBLE EIG TESTS =========================== @@ -461,6 +476,10 @@ dcsd.out: csd.in EIG/xeigtstd dlse.out: lse.in EIG/xeigtstd @echo LSE: Testing Constrained Linear Least Squares routines ./EIG/xeigtstd < lse.in > $@ 2>&1 + +ddmd.out: ddmd.in EIG/xdmdeigtstd + @echo DMD: Testing Dynamic Mode Decomposition routines + ./EIG/xdmdeigtstd < ddmd.in > $@ 2>&1 # # ======== COMPLEX16 EIG TESTS =========================== @@ -543,6 +562,10 @@ zcsd.out: csd.in EIG/xeigtstz zlse.out: lse.in EIG/xeigtstz @echo LSE: Testing Constrained Linear Least Squares routines ./EIG/xeigtstz < lse.in > $@ 2>&1 + +zdmd.out: zdmd.in EIG/xdmdeigtstz + @echo DMD: Testing Dynamic Mode Decomposition routines + ./EIG/xdmdeigtstz < zdmd.in > $@ 2>&1 # ============================================================================== LIN/xlintsts: $(FRCLIN) $(FRC) @@ -578,15 +601,27 
@@ LIN/xlintstzc: $(FRCLIN) $(FRC)
 
 EIG/xeigtsts: $(FRCEIG) $(FRC)
 	$(MAKE) -C EIG xeigtsts
 
+EIG/xdmdeigtsts: $(FRCEIG) $(FRC)
+	$(MAKE) -C EIG xdmdeigtsts
+
 EIG/xeigtstc: $(FRCEIG) $(FRC)
 	$(MAKE) -C EIG xeigtstc
 
+EIG/xdmdeigtstc: $(FRCEIG) $(FRC)
+	$(MAKE) -C EIG xdmdeigtstc
+
 EIG/xeigtstd: $(FRCEIG) $(FRC)
 	$(MAKE) -C EIG xeigtstd
 
+EIG/xdmdeigtstd: $(FRCEIG) $(FRC)
+	$(MAKE) -C EIG xdmdeigtstd
+
 EIG/xeigtstz: $(FRCEIG) $(FRC)
 	$(MAKE) -C EIG xeigtstz
 
+EIG/xdmdeigtstz: $(FRCEIG) $(FRC)
+	$(MAKE) -C EIG xdmdeigtstz
+
 .PHONY: clean cleantest
 clean: cleantest
 cleantest:
diff --git a/lapack-netlib/TESTING/cdmd.in b/lapack-netlib/TESTING/cdmd.in
new file mode 100644
index 000000000..42d046e01
--- /dev/null
+++ b/lapack-netlib/TESTING/cdmd.in
@@ -0,0 +1,11 @@
+10
+5
+
+20
+10
+
+30
+11
+
+50
+20
diff --git a/lapack-netlib/TESTING/ctest.in b/lapack-netlib/TESTING/ctest.in
index a3588b4a1..74ff31ab8 100644
--- a/lapack-netlib/TESTING/ctest.in
+++ b/lapack-netlib/TESTING/ctest.in
@@ -42,6 +42,7 @@ CRQ 8 List types on next line if 0 < NTYPES < 8
 CLQ 8 List types on next line if 0 < NTYPES < 8
 CQL 8 List types on next line if 0 < NTYPES < 8
 CQP 6 List types on next line if 0 < NTYPES < 6
+CQK 19 List types on next line if 0 < NTYPES < 19
 CTZ 3 List types on next line if 0 < NTYPES < 3
 CLS 6 List types on next line if 0 < NTYPES < 6
 CEQ
diff --git a/lapack-netlib/TESTING/ddmd.in b/lapack-netlib/TESTING/ddmd.in
new file mode 100644
index 000000000..42d046e01
--- /dev/null
+++ b/lapack-netlib/TESTING/ddmd.in
@@ -0,0 +1,11 @@
+10
+5
+
+20
+10
+
+30
+11
+
+50
+20
diff --git a/lapack-netlib/TESTING/dtest.in b/lapack-netlib/TESTING/dtest.in
index 29bb8b92e..1b6c7bd4a 100644
--- a/lapack-netlib/TESTING/dtest.in
+++ b/lapack-netlib/TESTING/dtest.in
@@ -36,6 +36,7 @@ DRQ 8 List types on next line if 0 < NTYPES < 8
 DLQ 8 List types on next line if 0 < NTYPES < 8
 DQL 8 List types on next line if 0 < NTYPES < 8
 DQP 6 List types on next line if 0 < NTYPES < 6
+DQK 19 List types on next line if 0 < NTYPES < 19
 DTZ 3 List types on next line if 0 < NTYPES < 3
 DLS 6 List types on next line if 0 < NTYPES < 6
 DEQ
diff --git a/lapack-netlib/TESTING/sdmd.in b/lapack-netlib/TESTING/sdmd.in
new file mode 100644
index 000000000..42d046e01
--- /dev/null
+++ b/lapack-netlib/TESTING/sdmd.in
@@ -0,0 +1,11 @@
+10
+5
+
+20
+10
+
+30
+11
+
+50
+20
diff --git a/lapack-netlib/TESTING/stest.in b/lapack-netlib/TESTING/stest.in
index 27ac30040..7faa8b7a1 100644
--- a/lapack-netlib/TESTING/stest.in
+++ b/lapack-netlib/TESTING/stest.in
@@ -36,6 +36,7 @@ SRQ 8 List types on next line if 0 < NTYPES < 8
 SLQ 8 List types on next line if 0 < NTYPES < 8
 SQL 8 List types on next line if 0 < NTYPES < 8
 SQP 6 List types on next line if 0 < NTYPES < 6
+SQK 19 List types on next line if 0 < NTYPES < 19
 STZ 3 List types on next line if 0 < NTYPES < 3
 SLS 6 List types on next line if 0 < NTYPES < 6
 SEQ
diff --git a/lapack-netlib/TESTING/zdmd.in b/lapack-netlib/TESTING/zdmd.in
new file mode 100644
index 000000000..42d046e01
--- /dev/null
+++ b/lapack-netlib/TESTING/zdmd.in
@@ -0,0 +1,11 @@
+10
+5
+
+20
+10
+
+30
+11
+
+50
+20
diff --git a/lapack-netlib/TESTING/ztest.in b/lapack-netlib/TESTING/ztest.in
index 58da33d60..c83e82e45 100644
--- a/lapack-netlib/TESTING/ztest.in
+++ b/lapack-netlib/TESTING/ztest.in
@@ -42,6 +42,7 @@ ZRQ 8 List types on next line if 0 < NTYPES < 8
 ZLQ 8 List types on next line if 0 < NTYPES < 8
 ZQL 8 List types on next line if 0 < NTYPES < 8
 ZQP 6 List types on next line if 0 < NTYPES < 6
+ZQK 19 List types on next line if 0 <
NTYPES < 19 ZTZ 3 List types on next line if 0 < NTYPES < 3 ZLS 6 List types on next line if 0 < NTYPES < 6 ZEQ diff --git a/lapack-netlib/lapack_testing.py b/lapack-netlib/lapack_testing.py index 5582744a0..fea0ff765 100755 --- a/lapack-netlib/lapack_testing.py +++ b/lapack-netlib/lapack_testing.py @@ -1,31 +1,29 @@ -#! /usr/bin/env python -# -*- coding: utf-8 -*- +#!/usr/bin/env python3 ############################################################################### # lapack_testing.py ############################################################################### -from __future__ import print_function from subprocess import Popen, STDOUT, PIPE import os, sys, math import getopt # Arguments try: opts, args = getopt.getopt(sys.argv[1:], "hd:b:srep:t:n", - ["help", "dir", "bin", "short", "run", "error","prec=","test=","number"]) + ["help", "dir=", "bin=", "short", "run", "error","prec=","test=","number"]) except getopt.error as msg: print(msg) print("for help use --help") sys.exit(2) -short_summary=0 -with_file=1 -just_errors = 0 +short_summary = False +with_file = True +just_errors = False prec='x' test='all' -only_numbers=0 +only_numbers = False test_dir='TESTING' bin_dir='bin/Release' @@ -34,10 +32,9 @@ for o, a in opts: print(sys.argv[0]+" [-h|--help] [-d dir |--dir dir] [-s |--short] [-r |--run] [-e |--error] [-p p |--prec p] [-t test |--test test] [-n | --number]") print(" - h is to print this message") print(" - r is to use to run the LAPACK tests then analyse the output (.out files). By default, the script will not run all the LAPACK tests") - print(" - d [dir] is to indicate where is the LAPACK testing directory (.out files). By default, the script will use .") - print(" - b [bin] is to indicate where is the LAPACK binary files are located. By default, the script will use .") + print(" - d [dir] indicates the location of the LAPACK testing directory (.out files). By default, the script will use {:s}.".format(test_dir)) + print(" - b [bin] indicates the location of the LAPACK binary files. 
By default, the script will use {:s}.".format(bin_dir)) print(" LEVEL OF OUTPUT") - print(" - x is to print a detailed summary") print(" - e is to print only the error summary") print(" - s is to print a short summary") print(" - n is to print the numbers of failing tests (turn on summary mode)") @@ -63,15 +60,14 @@ for o, a in opts: print(" Will return the numbers of failed tests in REAL precision by running the LAPACK Tests then analyzing the output") print(" ./lapack_testing.py -n -p s -t eig ") print(" Will return the numbers of failed tests in REAL precision by analyzing only the LAPACK output of EIGEN testings") - print("Written by Julie Langou (June 2011) ") sys.exit(0) else: if o in ("-s", "--short"): - short_summary = 1 + short_summary = True if o in ("-r", "--run"): - with_file = 0 + with_file = False if o in ("-e", "--error"): - just_errors = 1 + just_errors = True if o in ( '-p', '--prec' ): prec = a if o in ( '-b', '--bin' ): @@ -81,12 +77,12 @@ for o, a in opts: if o in ( '-t', '--test' ): test = a if o in ( '-n', '--number' ): - only_numbers = 1 - short_summary = 1 + only_numbers = True + short_summary = True # process options -abs_bin_dir=os.path.normpath(os.path.join(os.getcwd(),bin_dir)) +abs_bin_dir=os.path.abspath(bin_dir) os.chdir(test_dir) @@ -108,7 +104,7 @@ def run_summary_test( f, cmdline, short_summary): nb_test_illegal=0 nb_test_info=0 - if (with_file): + if with_file: if not os.path.exists(cmdline): error_message=cmdline+" file not found" r=1 @@ -140,21 +136,21 @@ def run_summary_test( f, cmdline, short_summary): for line in pipe.readlines(): f.write(str(line)) words_in_line=line.split() - if (line.find("run")!=-1): + if (line.find("run)")!=-1): # print line whereisrun=words_in_line.index("run)") nb_test_run+=int(words_in_line[whereisrun-2]) if (line.find("out of")!=-1): - if (short_summary==0): print(line, end=' ') + if not short_summary: print(line, end=' ') whereisout= words_in_line.index("out") nb_test_fail+=int(words_in_line[whereisout-1]) if ((line.find("illegal")!=-1) or (line.find("Illegal")!=-1)): - if (short_summary==0):print(line, end=' ') + if not short_summary: print(line, end=' ') nb_test_illegal+=1 if (line.find(" INFO")!=-1): - if (short_summary==0):print(line, end=' ') + if not short_summary: print(line, end=' ') nb_test_info+=1 - if (with_file==1): + if with_file: pipe.close() f.flush(); @@ -169,7 +165,7 @@ try: except IOError: f = sys.stdout -if (short_summary==0): +if not short_summary: print(" ") print("---------------- Testing LAPACK Routines ----------------") print(" ") @@ -203,6 +199,8 @@ elif test=='mixed': range_prec=[1,3] elif test=='rfp': range_test=[18] +elif test=='dmd': + range_test=[20] elif test=='eig': range_test=list(range(16)) else: @@ -219,7 +217,7 @@ for dtype in range_prec: letter = dtypes[0][dtype] name = dtypes[1][dtype] - if (short_summary==0): + if not short_summary: print(" ") print("------------------------- %s ------------------------" % name) print(" ") @@ -231,19 +229,19 @@ for dtype in range_prec: letter+"gd",letter+"sb",letter+"sg", letter+"bb","glm","gqr", "gsv","csd","lse", - letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp"), + letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp",letter+"dmd"), ("Nonsymmetric-Eigenvalue-Problem", "Symmetric-Eigenvalue-Problem", "Symmetric-Eigenvalue-Problem-2-stage", "Singular-Value-Decomposition", "Eigen-Condition","Nonsymmetric-Eigenvalue","Nonsymmetric-Generalized-Eigenvalue-Problem", "Nonsymmetric-Generalized-Eigenvalue-Problem-driver", 
"Symmetric-Eigenvalue-Problem", "Symmetric-Eigenvalue-Generalized-Problem", "Banded-Singular-Value-Decomposition-routines", "Generalized-Linear-Regression-Model-routines", "Generalized-QR-and-RQ-factorization-routines", "Generalized-Singular-Value-Decomposition-routines", "CS-Decomposition-routines", "Constrained-Linear-Least-Squares-routines", - "Linear-Equation-routines", "Mixed-Precision-linear-equation-routines","RFP-linear-equation-routines"), + "Linear-Equation-routines", "Mixed-Precision-linear-equation-routines","RFP-linear-equation-routines","Dynamic-Mode-Decomposition"), (letter+"nep", letter+"sep", letter+"se2", letter+"svd", letter+"ec",letter+"ed",letter+"gg", letter+"gd",letter+"sb",letter+"sg", letter+"bb",letter+"glm",letter+"gqr", letter+"gsv",letter+"csd",letter+"lse", - letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp"), + letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp",letter+"dmd"), ) @@ -252,7 +250,7 @@ for dtype in range_prec: # NEED TO SKIP SOME PRECISION (namely s and c) FOR PROTO MIXED PRECISION TESTING if dtest==17 and (letter=="s" or letter=="c"): continue - if (with_file==1): + if with_file: cmdbase=dtests[2][dtest]+".out" else: if dtest==16: @@ -264,10 +262,13 @@ for dtype in range_prec: elif dtest==18: # PROTO LIN TESTS cmdbase="LIN/xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + elif dtest==20: + # DMD EIG TESTS + cmdbase="EIG/xdmdeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" else: # EIG TESTS cmdbase="EIG/xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" - if (not just_errors and not short_summary): + if not just_errors and not short_summary: print("Testing "+name+" "+dtests[1][dtest]+"-"+cmdbase, end=' ') # Run the process: either to read the file or run the LAPACK testing nb_test = run_summary_test(f, cmdbase, short_summary) @@ -277,19 +278,19 @@ for dtype in range_prec: list_results[3][dtype]+=nb_test[3] got_error=nb_test[1]+nb_test[2]+nb_test[3] - if (not short_summary): - if (nb_test[0]>0 and just_errors==0): + if not short_summary: + if nb_test[0] > 0 and not just_errors: print("passed: "+str(nb_test[0])) - if (nb_test[1]>0): + if nb_test[1] > 0: print("failing to pass the threshold: "+str(nb_test[1])) - if (nb_test[2]>0): + if nb_test[2] > 0: print("Illegal Error: "+str(nb_test[2])) - if (nb_test[3]>0): + if nb_test[3] > 0: print("Info Error: "+str(nb_test[3])) - if (got_error>0 and just_errors==1): + if got_error > 0 and just_errors: print("ERROR IS LOCATED IN "+name+" "+dtests[1][dtest]+" [ "+cmdbase+" ]") print("") - if (just_errors==0): + if not just_errors: print("") # elif (got_error>0): # print dtests[2][dtest]+".out \t"+str(nb_test[1])+"\t"+str(nb_test[2])+"\t"+str(nb_test[3]) @@ -307,7 +308,7 @@ for dtype in range_prec: list_results[2][4]+=list_results[2][dtype] list_results[3][4]+=list_results[3][dtype] -if only_numbers==1: +if only_numbers: print(str(list_results[1][4])+"\n"+str(list_results[2][4]+list_results[3][4])) else: print(summary) diff --git a/param.h b/param.h index ee4640f57..469c38ce3 100644 --- a/param.h +++ b/param.h @@ -2853,13 +2853,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #define QGEMM_DEFAULT_UNROLL_N 2 -#define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #define QGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_M 1 -#define ZGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_P 256 @@ -2888,11 +2888,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 1 -#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_N 4 @@ -3359,13 +3359,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P 128 -#define DGEMM_DEFAULT_P 160 +#define SGEMM_DEFAULT_P 240 +#define DGEMM_DEFAULT_P 240 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q 352 -#define DGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 320 #define CGEMM_DEFAULT_Q 224 #define ZGEMM_DEFAULT_Q 112 @@ -3396,13 +3396,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define ZGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_MN 16 -#define SGEMM_DEFAULT_P 128 -#define DGEMM_DEFAULT_P 160 +#define SGEMM_DEFAULT_P 240 +#define DGEMM_DEFAULT_P 240 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q 352 -#define DGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 320 #define CGEMM_DEFAULT_Q 224 #define ZGEMM_DEFAULT_Q 112 diff --git a/test/Makefile b/test/Makefile index 715842b4d..56acf1c5b 100644 --- a/test/Makefile +++ b/test/Makefile @@ -260,6 +260,7 @@ endif FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) +CLDFLAGS = $(CFLAGS) $(LDFLAGS) ifeq ($(USE_OPENMP), 1) @@ -331,7 +332,7 @@ endif ifeq ($(BUILD_BFLOAT16),1) test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME) - $(CC) $(CFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + $(CC) $(CLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) endif ifeq ($(BUILD_COMPLEX),1)
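A note on the param.h retuning above: the UNROLL_M/UNROLL_N constants pick the register-tile (micro-kernel) shape, while the P and Q constants pick the cache-block dimensions of the packed panel of A. A rough sketch of where these four numbers enter a blocked GEMM, in Python with NumPy assumed; the real OpenBLAS kernels are hand-written assembly and additionally block the N dimension and pack B, so this is only a shape-level illustration:

    # Illustrative only: P/Q cache blocking with an MR x NR micro-kernel.
    import numpy as np

    P, Q = 240, 320        # cache-block sizes, cf. DGEMM_DEFAULT_P / _Q above
    MR, NR = 8, 4          # register tile, cf. DGEMM_DEFAULT_UNROLL_M / _N
    assert P % MR == 0     # P is kept a multiple of MR, as in the tuned values

    def blocked_gemm(a, b):
        m, k = a.shape
        _, n = b.shape
        c = np.zeros((m, n))
        for p0 in range(0, k, Q):                 # K dimension blocked by Q
            bp = b[p0:p0 + Q, :]
            for i0 in range(0, m, P):             # M dimension blocked by P
                ap = a[i0:i0 + P, p0:p0 + Q]      # "packed" P x Q panel of A
                for j in range(0, n, NR):         # micro-kernel sweep over
                    for i in range(0, ap.shape[0], MR):   # MR x NR tiles of C
                        c[i0 + i:i0 + i + MR, j:j + NR] += (
                            ap[i:i + MR, :] @ bp[:, j:j + NR])
        return c

    rng = np.random.default_rng(0)
    a, b = rng.random((300, 350)), rng.random((350, 280))
    assert np.allclose(blocked_gemm(a, b), a @ b)

The patch's move from P=128/160 and Q=352/128 to P=240 and Q=320/640 enlarges the packed panel to make better use of the target's cache, while the new CGEMM/ZGEMM 2x2 unrolls change only the register-tile shape.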