diff --git a/.cirrus.yml b/.cirrus.yml
index 6c2baf8a0..b4c4870d0 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -29,7 +29,7 @@ task:
     - mkdir build
     - cd build
     - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
-    - make
+    - make -j 4

 task:
   name: AppleM1/GCC/MAKE/OPENMP
diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml
index 4a9bf98b6..f1bf8064c 100644
--- a/.github/workflows/loongarch64.yml
+++ b/.github/workflows/loongarch64.yml
@@ -16,13 +16,13 @@ jobs:
       include:
         - target: LOONGSONGENERIC
           triple: loongarch64-unknown-linux-gnu
-          opts: NO_SHARED=1 TARGET=LOONGSONGENERIC
+          opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC
         - target: LOONGSON3R5
           triple: loongarch64-unknown-linux-gnu
-          opts: NO_SHARED=1 TARGET=LOONGSON3R5
+          opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5
         - target: LOONGSON2K1000
           triple: loongarch64-unknown-linux-gnu
-          opts: NO_SHARED=1 TARGET=LOONGSON2K1000
+          opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000
         - target: DYNAMIC_ARCH
           triple: loongarch64-unknown-linux-gnu
           opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC
@@ -40,8 +40,9 @@ jobs:

     - name: Download and install loongarch64-toolchain
       run: |
-        wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz
-        tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt
+        wget https://github.com/sunhaiyong1978/CLFS-for-LoongArch/releases/download/8.1/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
+        #wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
+        tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt

     - name: Set env
       run: |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6b65871f8..f3eac2edf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ project(OpenBLAS C ASM)

 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 25)
+set(OpenBLAS_PATCH_VERSION 25.dev)

 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
@@ -249,20 +249,21 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago
   endif()
 endif()

-if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS)
+# It seems this hack is not required since macOS 11 Big Sur
+if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
   set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
   if (NOT NOFORTRAN)
     set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
     set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
-      "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
-      "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
+      "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
+      "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
       "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
       "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
       "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
   else ()
     set (CMAKE_C_CREATE_SHARED_LIBRARY
-      "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
-      "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
+      "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
+      "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
       "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
   endif ()
 endif()
@@ -541,7 +542,7 @@ if(NOT NO_LAPACKE)
   ADD_CUSTOM_TARGET(genlapacke
   COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
   )
-  install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
+  install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 endif()

 # Install pkg-config files
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 71df13634..493747052 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -216,3 +216,6 @@ In chronological order:

 * Pablo Romero
   * [2022-08] Fix building from sources for QNX
+
+* Mark Seminatore
+  * [2023-11-09] Improve Windows threading performance scaling
\ No newline at end of file
diff --git a/Changelog.txt b/Changelog.txt
index e0fe0ca5a..b6139d6b7 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,49 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.26
+  2-Jan-2024
+
+general:
+- improved the version of openblas.pc that is created by the CMAKE build
+- fixed a CMAKE-specific build problem on older versions of MacOS
+- worked around linking problems on old versions of MacOS
+- corrected installation location of the lapacke_mangling header in CMAKE builds
+- added type declarations for complex variables to the MSVC-specific parts of the LAPACK header
+- significantly sped up ?GESV for small problem sizes by introducing a lower bound for multithreading
+- imported additions and corrections from the Reference-LAPACK project:
+  - added new LAPACK functions for truncated QR with pivoting (Reference-LAPACK PRs 891&941)
+  - handle miscalculation of minimum work array size in corner cases (Reference-LAPACK PR 942)
+  - fixed use of uninitialized variables in ?GEDMD and improved inline documentation (PR 959)
+  - fixed use of uninitialized variables (and consequential failures) in ?BBCSD (PR 967)
+  - added tests for the recently introduced Dynamic Mode Decomposition functions (PR 736)
+  - fixed several memory leaks in the LAPACK testsuite (PR 953)
+  - fixed counting of testsuite results by the Python script (PR 954)
+
+x86-64:
+- fixed computation of CASUM on SkylakeX and newer targets in the special
+  case that AVX512 is not supported by the compiler or operating environment
+- fixed potential undefined behaviour in the CASUM/ZASUM kernels for AVX512 targets
+- worked around a problem in the pre-AVX kernels for GEMV
+- sped up the thread management code on MS Windows
+
+arm64:
+- fixed building of the LAPACK testsuite with Xcode 15 on Apple M1 and newer
+- sped up the thread management code on MS Windows
+- sped up SGEMM and DGEMM on Neoverse V1 and N1
+- sped up ?DOT on SVE-capable targets
+- reduced the number of targets in DYNAMIC_ARCH builds by eliminating functionally equivalent ones
+- included support for Apple M1 and newer targets in DYNAMIC_ARCH builds
+
+power:
+- improved the SGEMM kernel for POWER10
+- fixed compilation with (very) old versions of gcc
+- fixed detection of old 32bit PPC targets in CMAKE-based builds
+- added autodetection of the POWERPC 7400 subtype
+- fixed CMAKE-based compilation for PPCG4 and PPC970 targets
+
+loongarch64:
+- added and improved optimized kernels for almost all BLAS functions
+
 ====================================================================
 Version 0.3.25
   12-Nov-2023
diff --git a/GotoBLAS_06WeirdPerformance.txt b/GotoBLAS_06WeirdPerformance.txt
index 05766e17b..0f7cec5c9 100644
--- a/GotoBLAS_06WeirdPerformance.txt
+++ b/GotoBLAS_06WeirdPerformance.txt
@@ -11,7 +11,7 @@
    operation is finished.

-2. Simlar problem may happen under virtual machine. If supervisor
+2. Similar problem may happen under virtual machine. If supervisor
    allocates different cores for each scheduling, BLAS performnace
    will be bad. This is because BLAS also utilizes all cache,
    unexpected re-schedule for different core may result of heavy
diff --git a/Makefile.power b/Makefile.power
index 95bada34f..3fa6d6faf 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -11,7 +11,19 @@ endif

 ifeq ($(CORE), POWER10)
 ifneq ($(C_COMPILER), PGI)
+ifeq ($(C_COMPILER), GCC)
+ifeq ($(GCCVERSIONGTEQ10), 1)
 CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+else ifneq ($(GCCVERSIONGT4), 1)
+$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
+CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
+else
+$(warning your compiler is too old to fully support POWER10, getting a newer version of gcc is recommended)
+CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
+endif
+else
+CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+endif
 ifeq ($(F_COMPILER), IBM)
 FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize
 else
diff --git a/Makefile.rule b/Makefile.rule
index fd44b1b12..58f02358e 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #

 # This library's version
-VERSION = 0.3.25
+VERSION = 0.3.25.dev

 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
diff --git a/Makefile.system b/Makefile.system
index 1b84195e4..e602eaf05 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -407,6 +407,7 @@ XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables |awk '/vers
 endif
 ifeq (x$(XCVER), x 15)
 CCOMMON_OPT += -Wl,-ld_classic
+FCOMMON_OPT += -Wl,-ld_classic
 endif
 endif
@@ -676,16 +677,12 @@ ifeq ($(ARCH), arm64)
 DYNAMIC_CORE = ARMV8
 DYNAMIC_CORE += CORTEXA53
 DYNAMIC_CORE += CORTEXA57
-DYNAMIC_CORE += CORTEXA72
-DYNAMIC_CORE += CORTEXA73
 DYNAMIC_CORE += NEOVERSEN1
 ifneq ($(NO_SVE), 1)
 DYNAMIC_CORE += NEOVERSEV1
 DYNAMIC_CORE += NEOVERSEN2
 DYNAMIC_CORE += ARMV8SVE
 endif
-DYNAMIC_CORE += CORTEXA55
-DYNAMIC_CORE += FALKOR
 DYNAMIC_CORE += THUNDERX
 DYNAMIC_CORE += THUNDERX2T99
 DYNAMIC_CORE += TSV110
diff --git a/README.md b/README.md
index 3c4e38f18..b8d66ed42 100644
--- a/README.md
+++ b/README.md
@@ -196,20 +196,22 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
   ```sh
   make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran
   ```
-  (also known to work on C906)
+  (also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision)

 ### Support for multiple targets in a single library

 OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.

-For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default.
+For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX, Cooper Lake, Sapphire Rapids. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default.

 `DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias, Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.

-On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
+On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. If compiler support for SVE is available at build time, support for NeoverseN2, NeoverseV1 as well as generic ArmV8SVE targets is also enabled.

-For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
+For **POWER**, the list encompasses POWER6, POWER8 and POWER9. POWER10 is additionally available if a sufficiently recent compiler is used for the build.
+
+On **ZARCH** it comprises Z13 and Z14 as well as generic zarch support.

 The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the common code in the library, usually you will want to set this to the oldest model you expect to encounter.
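For reference, a `DYNAMIC_ARCH` build can be asked at runtime which kernel set its CPU detection selected. A minimal sketch (illustrative only, not part of this patch; `openblas_get_corename()` and `openblas_get_num_threads()` are utility functions declared in OpenBLAS's `cblas.h`):

```c
#include <stdio.h>
#include <cblas.h>   /* declares the openblas_* utility functions */

int main(void) {
    /* Name of the kernel set chosen by runtime CPU detection,
       e.g. "NEOVERSEN1" on Apple M1 in an arm64 DYNAMIC_ARCH build. */
    printf("core name : %s\n", openblas_get_corename());
    /* Number of threads the library will use by default. */
    printf("threads   : %d\n", openblas_get_num_threads());
    return 0;
}
```

Linking the program against the freshly built `libopenblas` and running it is a quick way to confirm that the dispatch tables described above picked the intended target.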
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 317bc504a..3ae8615a7 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -288,9 +288,9 @@ jobs:
       vmImage: 'ubuntu-latest'
     steps:
     - script: |
-        wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \
-          && echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc  alpine-chroot-install' | sha1sum -c \
-          || exit 1
+        wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.14.0/alpine-chroot-install \
+          && echo 'ccbf65f85cdc351851f8ad025bb3e65bae4d5b06  alpine-chroot-install' | sha1sum -c \
+          || exit 1
         alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
         sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
         alpine make DYNAMIC_ARCH=1 BINARY=64
diff --git a/benchmark/trsv.c b/benchmark/trsv.c
index 66ac3a3c7..e17c57157 100644
--- a/benchmark/trsv.c
+++ b/benchmark/trsv.c
@@ -127,7 +127,7 @@ int main(int argc, char *argv[]){
     long long muls = n*(n+1)/2.0;
     long long adds = (n - 1.0)*n/2.0;

-    fprintf(stderr, "%10d %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg);
+    fprintf(stderr, "%10d : %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg);
     if(a != NULL){
       free(a);
     }
diff --git a/c_check b/c_check
index b018c10a8..b5e4a9ad0 100755
--- a/c_check
+++ b/c_check
@@ -199,8 +199,7 @@ if [ "$architecture" = "loongarch64" ]; then
     tmpd="$(mktemp -d)"
     tmplsx="$tmpd/lsx.c"
     codelsx='"vadd.b $vr0, $vr0, $vr0"'
-    lsx_flags='-march=loongarch64 -mlsx'
-    printf "#include <lsxintrin.h>\n\n" >> "$tmplsx"
+    lsx_flags='-march=loongarch64'
     printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
     args="$lsx_flags -o $tmplsx.o $tmplsx"
     {
@@ -211,8 +210,7 @@ if [ "$architecture" = "loongarch64" ]; then
     tmplasx="$tmpd/lasx.c"
     codelasx='"xvadd.b $xr0, $xr0, $xr0"'
-    lasx_flags='-march=loongarch64 -mlasx'
-    printf "#include <lasxintrin.h>\n\n" >> "$tmplasx"
+    lasx_flags='-march=loongarch64'
     printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
     args="$lasx_flags -o $tmplasx.o $tmplasx"
     {
diff --git a/c_check.pl b/c_check.pl
index 7a860a211..d9c36793c 100644
--- a/c_check.pl
+++ b/c_check.pl
@@ -241,8 +241,7 @@ if (($architecture eq "loongarch64")) {
     } else {
         $tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
         $codelsx = '"vadd.b $vr0, $vr0, $vr0"';
-        $lsx_flags = "-march=loongarch64 -mlsx";
-        print $tmplsx "#include <lsxintrin.h>\n\n";
+        $lsx_flags = "-march=loongarch64";
         print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n";

         $args = "$lsx_flags -o $tmplsx.o $tmplsx";
@@ -257,8 +256,7 @@ if (($architecture eq "loongarch64")) {
         $tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
         $codelasx = '"xvadd.b $xr0, $xr0, $xr0"';
-        $lasx_flags = "-march=loongarch64 -mlasx";
-        print $tmplasx "#include <lasxintrin.h>\n\n";
+        $lasx_flags = "-march=loongarch64";
         print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n";

         $args = "$lasx_flags -o $tmplasx.o $tmplasx";
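Both scripts now probe LSX/LASX support by asking the toolchain to assemble a bare vector instruction, rather than by including intrinsic headers and passing `-mlsx`/`-mlasx`. The probe source they emit is equivalent to this standalone file (reproduced from the printf lines above; `void main` is kept verbatim as the scripts write it):

```c
/* lsx.c - LSX probe as emitted by c_check / c_check.pl.
   Compile with: $CC -march=loongarch64 -o lsx.o lsx.c
   If compilation succeeds, the assembler understands LSX;
   the LASX variant uses "xvadd.b $xr0, $xr0, $xr0" instead. */
void main(void){ __asm__ volatile("vadd.b $vr0, $vr0, $vr0");}
```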
diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index ebdc5a833..eb974456b 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -44,7 +44,7 @@ endif ()

 if (DYNAMIC_ARCH)
   if (ARM64)
-    set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
+    set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
     if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
       set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
     endif ()
diff --git a/cmake/cc.cmake b/cmake/cc.cmake
index 7b4ef8947..2da941afb 100644
--- a/cmake/cc.cmake
+++ b/cmake/cc.cmake
@@ -36,9 +36,19 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS

   if (LOONGARCH64)
     if (BINARY64)
-      set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
+      CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
+      if(COMPILER_SUPPORT_LP64D_ABI)
+        set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64d")
+      else()
+        set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
+      endif ()
     else ()
-      set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32")
+      CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
+      if(COMPILER_SUPPORT_ILP32D_ABI)
+        set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=ilp32d")
+      else()
+        set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32")
+      endif ()
     endif ()
     set(BINARY_DEFINED 1)
   endif ()
@@ -282,6 +292,27 @@ if (${CORE} STREQUAL POWER8)
   endif ()
 endif ()

+# With -mcpu=970 added it compiles, but the library is broken, at least on macOS. If someone
+# tests on *BSD or Linux and adds this flag, please make sure it is not used for the macOS case.
+if (${CORE} STREQUAL PPC970)
+  if (NOT DYNAMIC_ARCH)
+    set (CCOMMON_OPT "${CCOMMON_OPT} -mtune=970 -maltivec -fno-fast-math")
+  endif ()
+  if (APPLE)
+    set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL")
+  endif ()
+endif ()
+
+# -mcpu=G4 seems to work fine, but perhaps avoid it for the sake of consistency?
+if (${CORE} STREQUAL PPCG4)
+  if (NOT DYNAMIC_ARCH)
+    set (CCOMMON_OPT "${CCOMMON_OPT} -mtune=G4 -maltivec -fno-fast-math")
+  endif ()
+  if (APPLE)
+    set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL")
+  endif ()
+endif ()
+
 if (NOT DYNAMIC_ARCH)
   if (HAVE_AVX2)
     set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2")
diff --git a/cmake/fc.cmake b/cmake/fc.cmake
index c496f6368..5c30be843 100644
--- a/cmake/fc.cmake
+++ b/cmake/fc.cmake
@@ -61,9 +61,19 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
   endif ()
   if (LOONGARCH64)
     if (BINARY64)
-      set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
+      CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
+      if(COMPILER_SUPPORT_LP64D_ABI)
+        set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d")
+      else()
+        set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
+      endif ()
     else ()
-      set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
+      CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
+      if(COMPILER_SUPPORT_ILP32D_ABI)
+        set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d")
+      else()
+        set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
+      endif ()
     endif ()
   endif ()
   if (RISCV64)
diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake
index 22476f561..003a8b3c1 100644
--- a/cmake/lapack.cmake
+++ b/cmake/lapack.cmake
@@ -52,7 +52,7 @@ set(SLASRC
    sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f
    sgehd2.f sgehrd.f sgelq2.f sgelqf.f
    sgels.f sgelsd.f sgelss.f sgelsy.f sgeql2.f sgeqlf.f
-   sgeqp3.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f
+   sgeqp3.f sgeqp3rk.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f
    sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f sgetc2.f
    sgetrf2.f sgetri.f
    sggbak.f sggbal.f
@@ -67,7 +67,7 @@ set(SLASRC
    slangb.f slange.f slangt.f slanhs.f
    slansb.f slansp.f slansy.f slantb.f slantp.f slantr.f slanv2.f
    slapll.f slapmt.f
-   slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f
+   slaqgb.f slaqge.f slaqp2.f slaqps.f slaqp2rk.f slaqp3rk.f slaqsb.f slaqsp.f slaqsy.f
    slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f slaqtr.f slar1v.f slar2v.f
    ilaslr.f ilaslc.f
    slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
@@ -139,7 +139,7 @@ set(CLASRC
    cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f
    cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f
    cgehd2.f cgehrd.f cgelq2.f cgelqf.f
-   cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f
+   cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f cgeqp3rk.f
    cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f cgerq2.f cgerqf.f
    cgesc2.f cgesdd.f cgesvd.f cgesvdx.f
    cgesvj.f cgejsv.f cgsvj0.f cgsvj1.f
@@ -173,7 +173,7 @@ set(CLASRC
    clanhb.f clanhe.f clanhp.f clanhs.f clanht.f clansb.f clansp.f
    clansy.f clantb.f clantp.f clantr.f
    clapll.f clapmt.f clarcm.f claqgb.f claqge.f
-   claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f
+   claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqp2rk.f claqp3rk.f claqsb.f
    claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
    claqz0.f claqz1.f claqz2.f claqz3.f
    claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
@@ -243,7 +243,7 @@ set(DLASRC
    dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f
    dgehd2.f dgehrd.f dgelq2.f dgelqf.f
    dgels.f dgelsd.f dgelss.f dgelsy.f dgeql2.f dgeqlf.f
-   dgeqp3.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f
+   dgeqp3.f dgeqp3rk.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f
    dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f dgetc2.f
    dgetrf2.f dgetri.f
    dggbak.f dggbal.f
@@ -258,7 +258,7 @@ set(DLASRC
    dlangb.f dlange.f dlangt.f dlanhs.f
    dlansb.f dlansp.f dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f
    dlapll.f dlapmt.f
-   dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
+   dlaqgb.f dlaqge.f dlaqp2.f dlaqp2rk.f dlaqp3rk.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
    dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f dlaqtr.f dlar1v.f dlar2v.f
    iladlr.f iladlc.f
    dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
@@ -331,7 +331,7 @@ set(ZLASRC
    zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f
    zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f
    zgehd2.f zgehrd.f zgelq2.f zgelqf.f
-   zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f
+   zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f zgeqp3rk.f
    zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f
    zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvx.f
    zgesvj.f zgejsv.f zgsvj0.f zgsvj1.f
@@ -367,7 +367,7 @@ set(ZLASRC
    zlanhe.f zlanhp.f zlanhs.f zlanht.f
    zlansb.f zlansp.f zlansy.f zlantb.f zlantp.f zlantr.f
    zlapll.f zlapmt.f zlaqgb.f zlaqge.f
-   zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f
+   zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqp2rk.f zlaqp3rk.f zlaqps.f zlaqsb.f
    zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
    zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
    zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f
@@ -557,7 +557,7 @@ set(SLASRC
    sgebrd.c sgecon.c sgeequ.c sgees.c sgeesx.c sgeev.c sgeevx.c
    sgehd2.c sgehrd.c sgelq2.c sgelqf.c
    sgels.c sgelsd.c sgelss.c sgelsy.c sgeql2.c sgeqlf.c
-   sgeqp3.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c
+   sgeqp3.c sgeqp3rk.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c
    sgesc2.c sgesdd.c sgesvd.c sgesvdx.c sgesvx.c sgetc2.c
   sgetrf2.c sgetri.c
    sggbak.c sggbal.c
@@ -571,7 +571,7 @@ set(SLASRC
    slangb.c slange.c slangt.c slanhs.c
    slansb.c slansp.c slansy.c slantb.c slantp.c slantr.c slanv2.c
    slapll.c slapmt.c
-   slaqgb.c slaqge.c slaqp2.c slaqps.c slaqsb.c slaqsp.c slaqsy.c
+   slaqgb.c slaqge.c slaqp2.c slaqp2rk.c slaqp3rk.c slaqps.c slaqsb.c slaqsp.c slaqsy.c
    slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c slaqtr.c slar1v.c slar2v.c
    ilaslr.c ilaslc.c
    slarf.c slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c
@@ -643,7 +643,7 @@ set(CLASRC
    cgbtf2.c cgbtrf.c cgbtrs.c cgebak.c cgebal.c cgebd2.c cgebrd.c
    cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c
    cgehd2.c cgehrd.c cgelq2.c cgelqf.c
-   cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c
+   cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c cgeqp3rk.c
    cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c
    cgesc2.c cgesdd.c cgesvd.c cgesvdx.c
    cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c
@@ -677,7 +677,7 @@ set(CLASRC
    clanhb.c clanhe.c clanhp.c clanhs.c clanht.c clansb.c clansp.c
    clansy.c clantb.c clantp.c clantr.c
    clapll.c clapmt.c clarcm.c claqgb.c claqge.c
-   claqhb.c claqhe.c claqhp.c claqp2.c claqps.c claqsb.c
+   claqhb.c claqhe.c claqhp.c claqp2.c claqp2rk.c claqp3rk.c claqps.c claqsb.c
    claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c
    claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c
    clarf.c clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c
@@ -746,7 +746,7 @@ set(DLASRC
    dgebrd.c dgecon.c dgeequ.c dgees.c dgeesx.c dgeev.c dgeevx.c
    dgehd2.c dgehrd.c dgelq2.c dgelqf.c
    dgels.c dgelsd.c dgelss.c dgelsy.c dgeql2.c dgeqlf.c
-   dgeqp3.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c
+   dgeqp3.c dgeqp3rk.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c
    dgesc2.c dgesdd.c dgesvd.c dgesvdx.c dgesvx.c dgetc2.c
    dgetrf2.c dgetri.c
    dggbak.c dggbal.c
@@ -760,7 +760,7 @@ set(DLASRC
    dlangb.c dlange.c dlangt.c dlanhs.c
    dlansb.c dlansp.c dlansy.c dlantb.c dlantp.c dlantr.c dlanv2.c
    dlapll.c dlapmt.c
-   dlaqgb.c dlaqge.c dlaqp2.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c
+   dlaqgb.c dlaqge.c dlaqp2.c dlaqp2rk.c dlaqp3rk.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c
    dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c dlaqtr.c dlar1v.c dlar2v.c
    iladlr.c iladlc.c
    dlarf.c dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c
@@ -833,7 +833,7 @@ set(ZLASRC
    zgbtf2.c zgbtrf.c zgbtrs.c zgebak.c zgebal.c zgebd2.c zgebrd.c
    zgecon.c zgeequ.c zgees.c zgeesx.c zgeev.c zgeevx.c
    zgehd2.c zgehrd.c zgelq2.c zgelqf.c
-   zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c
+   zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c zgeqp3rk.c
    zgeqr2.c zgeqr2p.c zgeqrf.c zgeqrfp.c zgerfs.c zgerq2.c zgerqf.c
    zgesc2.c zgesdd.c zgesvd.c zgesvdx.c zgesvx.c
    zgesvj.c zgejsv.c zgsvj0.c zgsvj1.c
@@ -868,7 +868,7 @@ set(ZLASRC
    zlanhe.c zlanhp.c zlanhs.c zlanht.c
    zlansb.c zlansp.c zlansy.c zlantb.c zlantp.c zlantr.c
    zlapll.c zlapmt.c zlaqgb.c zlaqge.c
-   zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqps.c zlaqsb.c
+   zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqp2rk.c zlaqp3rk.c zlaqps.c zlaqsb.c
    zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c
    zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c
    zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c
diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in
index 7e120af86..11e5606e5 100644
--- a/cmake/openblas.pc.in
+++ b/cmake/openblas.pc.in
@@ -5,7 +5,7 @@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
 openblas_config=USE_64BITINT=@INTERFACE64@ NO_CBLAS=@NO_CBLAS@
NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
 Name: OpenBLAS
 Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
-Version: @OPENBLAS_VERSION@
-URL: https://github.com/xianyi/OpenBLAS
+Version: @OpenBLAS_VERSION@
+URL: https://github.com/OpenMathLib/OpenBLAS
 Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix}
 Cflags: -I${includedir}
diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake
index 49b9863e3..e94497a04 100644
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -38,7 +38,7 @@ if(CMAKE_CL_64 OR MINGW64)
   endif()
 elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING))
   set(X86 1)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc.*"))
   set(POWER 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
   set(MIPS64 1)
@@ -46,7 +46,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
   set(LOONGARCH64 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*")
   set(RISCV64 1)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*"))
   if (NOT BINARY)
     if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
       set(X86_64 1)
@@ -109,7 +109,7 @@ else()
 endif ()

 if (NOT BINARY)
-  if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64)
+  if (X86_64 OR ARM64 OR MIPS64 OR LOONGARCH64 OR RISCV64 OR (POWER AND NOT (CMAKE_OSX_ARCHITECTURES STREQUAL "ppc")))
     set(BINARY 64)
   else ()
     set(BINARY 32)
diff --git a/common_loongarch64.h b/common_loongarch64.h
index ce1fcf091..b1426da79 100644
--- a/common_loongarch64.h
+++ b/common_loongarch64.h
@@ -119,12 +119,50 @@ static inline int WhereAmI(void){
 #define MOV   fmov.d
 #define CMOVT fsel
 #define MTC   movgr2fr.d
+#define MTG   movfr2gr.d
 #define FABS  fabs.d
+#define FMIN  fmin.d
+#define FMINA fmina.d
+#define FMAX  fmax.d
+#define FMAXA fmaxa.d
 #define CMPEQ fcmp.ceq.d
 #define CMPLE fcmp.cle.d
 #define CMPLT fcmp.clt.d
 #define NEG   fneg.d
+#define FFINT ffint.d.l
+
+#define XVFSUB  xvfsub.d
+#define XVFADD  xvfadd.d
+#define XVFMUL  xvfmul.d
+#define XVFMADD xvfmadd.d
+#define XVFMIN  xvfmin.d
+#define XVFMINA xvfmina.d
+#define XVFMAX  xvfmax.d
+#define XVFMAXA xvfmaxa.d
+#define XVCMPEQ xvfcmp.ceq.d
+#define XVCMPLE xvfcmp.cle.d
+#define XVCMPLT xvfcmp.clt.d
+#define XVMUL   xvfmul.d
+#define XVMSUB  xvfmsub.d
+#define XVNMSUB xvfnmsub.d
+
+#define VFSUB  vfsub.d
+#define VFADD  vfadd.d
+#define VFMUL  vfmul.d
+#define VFMADD vfmadd.d
+#define VFMIN  vfmin.d
+#define VFMINA vfmina.d
+#define VFMAX  vfmax.d
+#define VFMAXA vfmaxa.d
+#define VCMPEQ vfcmp.ceq.d
+#define VCMPLE vfcmp.cle.d
+#define VCMPLT vfcmp.clt.d
+#define VMUL   vfmul.d
+#define VMSUB  vfmsub.d
+#define VNMSUB vfnmsub.d
+
 #else
+
 #define LD fld.s
 #define ST fst.s
 #define MADD fmadd.s
@@ -137,11 +175,48 @@ static inline int WhereAmI(void){
 #define MOV   fmov.s
 #define CMOVT fsel
 #define MTC   movgr2fr.w
+#define MTG   movfr2gr.s
 #define FABS  fabs.s
+#define FMIN  fmin.s
+#define FMINA fmina.s
+#define FMAX  fmax.s
+#define FMAXA fmaxa.s
 #define CMPEQ fcmp.ceq.s
 #define CMPLE fcmp.cle.s
 #define CMPLT fcmp.clt.s
 #define NEG   fneg.s
+#define FFINT ffint.s.l
+
+#define XVFSUB  xvfsub.s
+#define XVFADD  xvfadd.s
+#define XVFMUL  xvfmul.s
+#define XVFMADD xvfmadd.s
+#define XVFMIN  xvfmin.s
+#define XVFMINA xvfmina.s
+#define XVFMAX  xvfmax.s
+#define XVFMAXA xvfmaxa.s
+#define XVCMPEQ xvfcmp.ceq.s
+#define XVCMPLE xvfcmp.cle.s
+#define XVCMPLT xvfcmp.clt.s
+#define XVMUL   xvfmul.s
+#define XVMSUB  xvfmsub.s
+#define XVNMSUB xvfnmsub.s
+
+#define VFSUB  vfsub.s
+#define VFADD  vfadd.s
+#define VFMUL  vfmul.s
+#define VFMADD vfmadd.s
+#define VFMIN  vfmin.s
+#define VFMINA vfmina.s
+#define VFMAX  vfmax.s
+#define VFMAXA vfmaxa.s
+#define VCMPEQ vfcmp.ceq.s
+#define VCMPLE vfcmp.cle.s
+#define VCMPLT vfcmp.clt.s
+#define VMUL   vfmul.s
+#define VMSUB  vfmsub.s
+#define VNMSUB vfnmsub.s
+
 #endif /* defined(DOUBLE) */

 #if defined(__64BIT__) && defined(USE64BITINT)
diff --git a/common_thread.h b/common_thread.h
index 6e18d2a8e..9e7dae74a 100644
--- a/common_thread.h
+++ b/common_thread.h
@@ -111,8 +111,9 @@ typedef struct blas_queue {
   struct blas_queue *next;

 #if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__)
-  CRITICAL_SECTION lock;
-  HANDLE finish;
+  // CRITICAL_SECTION lock;
+  // HANDLE finish;
+  volatile int finished;
 #else
   pthread_mutex_t lock;
   pthread_cond_t finished;
diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c
index 7c389db27..0ad32ae4e 100644
--- a/cpuid_loongarch64.c
+++ b/cpuid_loongarch64.c
@@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CPU_LOONGSON3R5    1
 #define CPU_LOONGSON2K1000 2

-#define LA_HWCAP_LSX  (1<<4)
-#define LA_HWCAP_LASX (1<<5)
+#define LA_HWCAP_LSX  (1U << 4)
+#define LA_HWCAP_LASX (1U << 5)

 static char *cpuname[] = {
   "LOONGSONGENERIC",
@@ -64,11 +64,11 @@ static char *cpuname_lower[] = {

 int detect(void) {
 #ifdef __linux
-  int flag = (int)getauxval(AT_HWCAP);
+  int hwcap = (int)getauxval(AT_HWCAP);

-  if (flag & LA_HWCAP_LASX)
+  if (hwcap & LA_HWCAP_LASX)
     return CPU_LOONGSON3R5;
-  else if (flag & LA_HWCAP_LSX)
+  else if (hwcap & LA_HWCAP_LSX)
     return CPU_LOONGSON2K1000;
   else
     return CPU_GENERIC;
@@ -94,7 +94,9 @@ void get_subdirname(void) {
 }

 void get_cpuconfig(void) {
+  uint32_t hwcaps = 0;
   int d = detect();
+
   switch (d) {
     case CPU_LOONGSON3R5:
       printf("#define LOONGSON3R5\n");
@@ -129,6 +131,10 @@ void get_cpuconfig(void) {
     printf("#define L2_ASSOCIATIVE 16\n");
     break;
   }
+
+  hwcaps = (uint32_t)getauxval( AT_HWCAP );
+  if (hwcaps & LA_HWCAP_LSX)  printf("#define HAVE_LSX\n");
+  if (hwcaps & LA_HWCAP_LASX) printf("#define HAVE_LASX\n");
 }

 void get_libname(void){
diff --git a/cpuid_power.c b/cpuid_power.c
index 2526e8d0e..1ced8930a 100644
--- a/cpuid_power.c
+++ b/cpuid_power.c
@@ -160,6 +160,7 @@ int detect(void){
   infoCount = HOST_BASIC_INFO_COUNT;
   host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, &infoCount);

+  if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7400) return CPUTYPE_PPCG4;
   if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7450) return CPUTYPE_PPCG4;
   if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_970)  return CPUTYPE_PPC970;
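The LoongArch cpuid code above (and the dynamic-arch dispatcher later in this patch) derives CPU capabilities from the kernel's hardware-capability vector rather than from privileged instructions. A minimal standalone sketch of the same detection logic, assuming a Linux/glibc environment (illustrative, not part of the patch):

```c
#include <stdio.h>
#include <stdint.h>
#include <sys/auxv.h>   /* getauxval, AT_HWCAP */

#define LA_HWCAP_LSX  (1U << 4)   /* 128-bit SIMD available */
#define LA_HWCAP_LASX (1U << 5)   /* 256-bit SIMD available */

int main(void) {
    uint32_t hwcap = (uint32_t)getauxval(AT_HWCAP);
    if (hwcap & LA_HWCAP_LASX)
        puts("LOONGSON3R5 path (LASX)");
    else if (hwcap & LA_HWCAP_LSX)
        puts("LOONGSON2K1000 path (LSX)");
    else
        puts("LOONGSONGENERIC path");
    return 0;
}
```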
diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 5bdfc1276..40ff85abc 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -51,15 +51,10 @@

 /* This is a thread implementation for Win32 lazy implementation */

 /* Thread server common information */
-typedef struct{
-  CRITICAL_SECTION lock;
-  HANDLE filled;
-  HANDLE killed;
-
-  blas_queue_t *queue;    /* Parameter Pointer */
-  int shutdown;           /* server shutdown flag */
-
-} blas_pool_t;
+static blas_queue_t *work_queue = NULL;
+static HANDLE kickoff_event = NULL;
+static CRITICAL_SECTION queue_lock;

 /* We need this global for checking if initialization is finished. */
 int blas_server_avail = 0;
@@ -67,11 +62,19 @@ int blas_server_avail = 0;
 /* Local Variables */
 static BLASULONG server_lock = 0;

-static blas_pool_t pool;
 static HANDLE blas_threads [MAX_CPU_NUMBER];
 static DWORD  blas_threads_id[MAX_CPU_NUMBER];
+static volatile int thread_target; // target num of live threads, volatile for cross-thread reads
-
+#if defined (__GNUC__) && (__GNUC__ < 6)
+  #define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch)
+#else
+  #if defined(_WIN64)
+    #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp)
+  #else
+    #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp)
+  #endif
+#endif

 static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
@@ -202,14 +205,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 static DWORD WINAPI blas_thread_server(void *arg){

   /* Thread identifier */
-#ifdef SMP_DEBUG
   BLASLONG cpu = (BLASLONG)arg;
-#endif

   void *buffer, *sa, *sb;
   blas_queue_t *queue;
-  DWORD action;
-  HANDLE handles[] = {pool.filled, pool.killed};

   /* Each server needs each buffer */
   buffer = blas_memory_alloc(2);
@@ -225,29 +224,44 @@ static DWORD WINAPI blas_thread_server(void *arg){
 #ifdef SMP_DEBUG
     fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
 #endif
+    // event raised when work is added to the queue
+    WaitForSingleObject(kickoff_event, INFINITE);

-    do {
-      action = WaitForMultipleObjects(2, handles, FALSE, INFINITE);
-    } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1));
-
-    if (action == WAIT_OBJECT_0 + 1) break;
+    if (cpu > thread_target - 2)
+    {
+      //printf("thread [%d] exiting.\n", cpu);
+      break;  // excess thread, so worker thread exits
+    }

 #ifdef SMP_DEBUG
     fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
 #endif

-    EnterCriticalSection(&pool.lock);
+#if 1
+    EnterCriticalSection(&queue_lock);
+
+    queue = work_queue;
+    if (queue)
+      work_queue = work_queue->next;
+
+    LeaveCriticalSection(&queue_lock);
+#else
+    volatile blas_queue_t* queue_next;

-    queue = pool.queue;
-    if (queue) pool.queue = queue->next;
+    INT_PTR prev_value;
+    do {
+      queue = (volatile blas_queue_t*)work_queue;
+      if (!queue)
+        break;

-    LeaveCriticalSection(&pool.lock);
+      queue_next = (volatile blas_queue_t*)queue->next;
+      prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue);
+    } while (prev_value != queue);
+#endif

     if (queue) {
       int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;

-      if (pool.queue) SetEvent(pool.filled);
-
       sa = queue -> sa;
       sb = queue -> sb;
@@ -331,14 +345,9 @@ static DWORD WINAPI blas_thread_server(void *arg){
 #ifdef SMP_DEBUG
     fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
 #endif
+
+    queue->finished = 1;

-    EnterCriticalSection(&queue->lock);
-
-    queue -> status = BLAS_STATUS_FINISHED;
-
-    LeaveCriticalSection(&queue->lock);
-
-    SetEvent(queue->finish);
   }

   /* Shutdown procedure */
@@ -366,15 +375,16 @@ int blas_thread_init(void){
 #endif

   if (!blas_server_avail){
+    // create the kickoff Event
+    kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);

-    InitializeCriticalSection(&pool.lock);
-    pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
-    pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL);
+    thread_target = blas_cpu_number;

-    pool.shutdown = 0;
-    pool.queue    = NULL;
+    InitializeCriticalSection(&queue_lock);

     for(i = 0; i < blas_cpu_number - 1; i++){
+      //printf("thread_init: creating thread [%d]\n", i);
+
       blas_threads[i] = CreateThread(NULL, 0,
                                      blas_thread_server, (void *)i,
                                      0, &blas_threads_id[i]);
@@ -409,8 +419,6 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
   current = queue;

   while (current) {
-    InitializeCriticalSection(&current -> lock);
-    current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL);
     current -> position = pos;

 #ifdef CONSISTENT_FPCSR
@@ -418,23 +426,32 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
     __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode));
 #endif

+    current->finished = 0;
     current = current -> next;
     pos ++;
   }

-  EnterCriticalSection(&pool.lock);
+  EnterCriticalSection(&queue_lock);
+
+  if (!work_queue)
+  {
+    work_queue = queue;
+  }
+  else
+  {
+    blas_queue_t *next_item = work_queue;
+
+    // find the end of the work queue
+    while (next_item->next)
+      next_item = next_item->next;

-  if (pool.queue) {
-    current = pool.queue;
-    while (current -> next) current = current -> next;
-    current -> next = queue;
-  } else {
-    pool.queue = queue;
+    // add new work to the end
+    next_item->next = queue;
   }

-  LeaveCriticalSection(&pool.lock);
+  LeaveCriticalSection(&queue_lock);

-  SetEvent(pool.filled);
+  SetEvent(kickoff_event);

   return 0;
 }
@@ -449,21 +466,26 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
 #ifdef SMP_DEBUG
     fprintf(STDERR, "Waiting Queue ..\n");
 #endif
+    while (!queue->finished)
+      YIELDING;

-    WaitForSingleObject(queue->finish, INFINITE);
-
-    CloseHandle(queue->finish);
-    DeleteCriticalSection(&queue -> lock);
-
-    queue = queue -> next;
-    num --;
+    queue = queue->next;
+    num--;
   }

 #ifdef SMP_DEBUG
   fprintf(STDERR, "Completely Done.\n\n");
 #endif
+  // if work was added to the queue after this batch we can't sleep the worker threads
+  // by resetting the event
+  EnterCriticalSection(&queue_lock);

-  return 0;
+  if (work_queue == NULL)
+    ResetEvent(kickoff_event);
+
+  LeaveCriticalSection(&queue_lock);
+
+  return 0;
 }

 /* Execute Threads */
@@ -512,8 +534,6 @@ int BLASFUNC(blas_thread_shutdown)(void){

   if (blas_server_avail){

-    SetEvent(pool.killed);
-
     for(i = 0; i < blas_num_threads - 1; i++){
       // Could also just use WaitForMultipleObjects
       DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
@@ -528,9 +548,6 @@ int BLASFUNC(blas_thread_shutdown)(void){
       CloseHandle(blas_threads[i]);
     }

-    CloseHandle(pool.filled);
-    CloseHandle(pool.killed);
-
     blas_server_avail = 0;
   }
@@ -552,23 +569,48 @@ void goto_set_num_threads(int num_threads)

   if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;

+  if (blas_server_avail && num_threads < blas_num_threads) {
+    LOCK_COMMAND(&server_lock);
+
+    thread_target = num_threads;
+
+    SetEvent(kickoff_event);
+
+    for (i = num_threads - 1; i < blas_num_threads - 1; i++) {
+      //printf("set_num_threads: waiting on thread [%d] to quit.\n", i);
+
+      WaitForSingleObject(blas_threads[i], INFINITE);
+
+      //printf("set_num_threads: thread [%d] has quit.\n", i);
+
+      CloseHandle(blas_threads[i]);
+    }
+
+    blas_num_threads = num_threads;
+
+    ResetEvent(kickoff_event);
+
+    UNLOCK_COMMAND(&server_lock);
+  }
+
   if (num_threads > blas_num_threads) {

     LOCK_COMMAND(&server_lock);

+    thread_target = num_threads;
+
     //increased_threads = 1;

     if (!blas_server_avail){
+      // create the kickoff Event
+      kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);

-      InitializeCriticalSection(&pool.lock);
-      pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
-      pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL);
+      InitializeCriticalSection(&queue_lock);

-      pool.shutdown = 0;
-      pool.queue    = NULL;
-
       blas_server_avail = 1;
     }

     for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
+      //printf("set_num_threads: creating thread [%d]\n", i);

       blas_threads[i] = CreateThread(NULL, 0,
                                      blas_thread_server, (void *)i,
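The rework above replaces one kernel event and critical section per work item with a single manual-reset kickoff event plus a volatile per-item completion flag. A stripped-down sketch of that handshake (illustrative only; names follow the patch, error handling omitted, LIFO push used for brevity where the patch appends FIFO):

```c
#include <windows.h>

typedef struct item { struct item *next; volatile int finished; } item_t;

static item_t *work_queue;           /* guarded by queue_lock */
static HANDLE kickoff_event;         /* manual-reset: stays signaled while work may exist */
static CRITICAL_SECTION queue_lock;

static void init(void) {
    InitializeCriticalSection(&queue_lock);
    /* manual-reset (TRUE), initially unsignaled (FALSE) */
    kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
}

/* producer side, as in exec_blas_async() */
static void submit(item_t *it) {
    it->finished = 0;
    EnterCriticalSection(&queue_lock);
    it->next = work_queue;
    work_queue = it;
    LeaveCriticalSection(&queue_lock);
    SetEvent(kickoff_event);         /* wake every sleeping worker at once */
}

/* consumer side, as in blas_thread_server() */
static item_t *take(void) {
    item_t *it;
    WaitForSingleObject(kickoff_event, INFINITE);
    EnterCriticalSection(&queue_lock);
    it = work_queue;
    if (it) work_queue = it->next;
    LeaveCriticalSection(&queue_lock);
    return it;                       /* caller runs it, then sets it->finished = 1 */
}
```

The waiter then spins (with a yield) on `finished` instead of blocking on a per-item event, and the event is only reset once the queue is observed empty, which is exactly what `exec_blas_async_wait()` above does. The disabled `#else` branch in the patch shows an alternative lock-free pop built on the `WIN_CAS` compare-and-swap macro.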
diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index 530d18115..803e0b5eb 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -122,10 +122,11 @@ extern gotoblas_t gotoblas_CORTEXA55;
 #endif
 #else
 extern gotoblas_t gotoblas_CORTEXA53;
+#define gotoblas_CORTEXA55 gotoblas_CORTEXA53
 extern gotoblas_t gotoblas_CORTEXA57;
-extern gotoblas_t gotoblas_CORTEXA72;
-extern gotoblas_t gotoblas_CORTEXA73;
-extern gotoblas_t gotoblas_FALKOR;
+#define gotoblas_CORTEXA72 gotoblas_CORTEXA57
+#define gotoblas_CORTEXA73 gotoblas_CORTEXA57
+#define gotoblas_FALKOR gotoblas_CORTEXA57
 extern gotoblas_t gotoblas_THUNDERX;
 extern gotoblas_t gotoblas_THUNDERX2T99;
 extern gotoblas_t gotoblas_TSV110;
@@ -141,7 +142,6 @@ extern gotoblas_t gotoblas_ARMV8SVE;
 #define gotoblas_ARMV8SVE gotoblas_ARMV8
 #endif
 extern gotoblas_t gotoblas_THUNDERX3T110;
-extern gotoblas_t gotoblas_CORTEXA55;
 #endif

 extern void openblas_warning(int verbose, const char * msg);
@@ -247,6 +247,10 @@ static gotoblas_t *get_coretype(void) {
   int implementer, variant, part, arch, revision, midr_el1;
   char coremsg[128];

+#if defined (OS_DARWIN)
+  return &gotoblas_NEOVERSEN1;
+#endif
+
 #if (!defined OS_LINUX && !defined OS_ANDROID)
   return NULL;
 #else
@@ -352,6 +356,9 @@ static gotoblas_t *get_coretype(void) {
         return &gotoblas_FALKOR;
       }
       break;
+    case 0x61: // Apple
+      return &gotoblas_NEOVERSEN1;
+      break;
     default:
       snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
       openblas_warning(1, coremsg);
diff --git a/driver/others/dynamic_loongarch64.c b/driver/others/dynamic_loongarch64.c
index 52f8bcb2f..44de59669 100644
--- a/driver/others/dynamic_loongarch64.c
+++ b/driver/others/dynamic_loongarch64.c
@@ -25,6 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *******************************************************************************/

+#include <sys/auxv.h>
 #include "common.h"

 extern gotoblas_t gotoblas_LOONGSON3R5;
@@ -74,21 +75,15 @@ static gotoblas_t *force_coretype(char *coretype) {
   return NULL;
 }

-#define LASX_MASK      1<<7
-#define LSX_MASK       1<<6
-#define LOONGARCH_CFG2 0x02
+#define LA_HWCAP_LSX  (1U << 4)
+#define LA_HWCAP_LASX (1U << 5)

 static gotoblas_t *get_coretype(void) {
-  int ret = 0;
-
-  __asm__ volatile (
-    "cpucfg %0, %1 \n\t"
-    : "+&r"(ret)
-    : "r"(LOONGARCH_CFG2)
-  );
-
-  if (ret & LASX_MASK)
+  int hwcap = (int)getauxval(AT_HWCAP);
+
+  if (hwcap & LA_HWCAP_LASX)
     return &gotoblas_LOONGSON3R5;
-  else if (ret & LSX_MASK)
+  else if (hwcap & LA_HWCAP_LSX)
     return &gotoblas_LOONGSON2K1000;
   else
     return &gotoblas_LOONGSONGENERIC;
diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index f0faf2baf..0454f186c 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -66,8 +66,7 @@ static int cpuid(void)
 #endif
 	return CPU_UNKNOWN;
 }
-#else
-#if defined(C_PGI) || defined(__clang__)
+#elif defined(C_PGI) || defined(__clang__)
 /*
  * NV HPC compilers do not yet implement __builtin_cpu_is().
  * Fake a version here for use in the CPU detection code below.
@@ -196,13 +195,21 @@ static int cpuid(void)
 			cpu_type = pvrPOWER[i].cpu_type;
 	}
 	return (int)(cpu_type);
 }
-#endif /* C_PGI */
+#elif !defined(__BUILTIN_CPU_SUPPORTS__)
+static int cpuid(void)
+{
+	return CPU_UNKNOWN;
+}
 #endif /* _AIX */

 #ifndef __BUILTIN_CPU_SUPPORTS__
 #include <string.h>

-#if defined(_AIX) || (defined(__has_builtin) && !__has_builtin(__builtin_cpu_is))
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+#if defined(_AIX) || !__has_builtin(__builtin_cpu_is)
 static int __builtin_cpu_is(const char *arg)
 {
   static int ipinfo = -1;
@@ -227,7 +234,7 @@ static int __builtin_cpu_is(const char *arg)
 }
 #endif

-#if defined(_AIX) || (defined(__has_builtin) && !__has_builtin(__builtin_cpu_supports))
+#if defined(_AIX) || !__has_builtin(__builtin_cpu_supports)
 static int __builtin_cpu_supports(const char *arg)
 {
   return 0;
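The `#ifndef __has_builtin` guard added above is the usual portability idiom: on compilers that predate `__has_builtin`, the macro is defined to always answer 0, so the later `#if` tests degrade gracefully instead of failing to preprocess. A self-contained illustration of the same pattern (using `__builtin_expect` instead of the POWER-specific builtins so it compiles anywhere; that substitution is this example's assumption, not taken from the patch):

```c
#include <stdio.h>

/* Compilers without the __has_builtin operator see a stand-in that
   always reports "not available", keeping the #if below parseable. */
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif

#if __has_builtin(__builtin_expect)
#define LIKELY(x) __builtin_expect(!!(x), 1)
#else
#define LIKELY(x) (x)   /* graceful fallback, no branch hint */
#endif

int main(void) {
    int n = 3;
    if (LIKELY(n > 0))
        puts("fast path");
    return 0;
}
```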
diff --git a/interface/lapack/gesv.c b/interface/lapack/gesv.c
index 175350329..546c2bed2 100644
--- a/interface/lapack/gesv.c
+++ b/interface/lapack/gesv.c
@@ -114,7 +114,14 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv,

 #ifdef SMP
   args.common = NULL;
-  args.nthreads = num_cpu_avail(4);
+#ifndef DOUBLE
+  if (args.m*args.n < 40000)
+#else
+  if (args.m*args.n < 10000)
+#endif
+    args.nthreads=1;
+  else
+    args.nthreads = num_cpu_avail(4);

   if (args.nthreads == 1) {
 #endif
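The change above keeps small ?GESV problems on a single thread, where thread start-up overhead would otherwise dominate the factorization. Schematically, the decision reduces to the following helper (a hedged restatement of the logic above; `gesv_threads` is a made-up name for illustration):

```c
/* Single- vs. multi-threaded decision for ?GESV as introduced above:
   single precision gets a higher cutoff (40000) than double (10000)
   because each element is cheaper to process. */
static int gesv_threads(int m, int n, int is_double, int avail) {
    long area   = (long)m * n;           /* matrix "size" proxy m*n */
    long cutoff = is_double ? 10000 : 40000;
    return (area < cutoff) ? 1 : avail;  /* avail = num_cpu_avail(4) */
}
```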
diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX
index ccbce27e1..bc5999097 100644
--- a/kernel/arm64/KERNEL.A64FX
+++ b/kernel/arm64/KERNEL.A64FX
@@ -1,206 +1 @@
-SAMINKERNEL = ../arm/amin.c
-DAMINKERNEL = ../arm/amin.c
-CAMINKERNEL = ../arm/zamin.c
-ZAMINKERNEL = ../arm/zamin.c
-
-SMAXKERNEL = ../arm/max.c
-DMAXKERNEL = ../arm/max.c
-
-SMINKERNEL = ../arm/min.c
-DMINKERNEL = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL = ../arm/imax.c
-IDMAXKERNEL = ../arm/imax.c
-
-ISMINKERNEL = ../arm/imin.c
-IDMINKERNEL = ../arm/imin.c
-
-STRSMKERNEL_LN = trsm_kernel_LN_sve.c
-STRSMKERNEL_LT = trsm_kernel_LT_sve.c
-STRSMKERNEL_RN = trsm_kernel_RN_sve.c
-STRSMKERNEL_RT = trsm_kernel_RT_sve.c
-
-DTRSMKERNEL_LN = trsm_kernel_LN_sve.c
-DTRSMKERNEL_LT = trsm_kernel_LT_sve.c
-DTRSMKERNEL_RN = trsm_kernel_RN_sve.c
-DTRSMKERNEL_RT = trsm_kernel_RT_sve.c
-
-TRSMCOPYLN_M = trsm_lncopy_sve.c
-TRSMCOPYLT_M = trsm_ltcopy_sve.c
-TRSMCOPYUN_M = trsm_uncopy_sve.c
-TRSMCOPYUT_M = trsm_utcopy_sve.c
-
-CTRSMKERNEL_LN = trsm_kernel_LN_sve.c
-CTRSMKERNEL_LT = trsm_kernel_LT_sve.c
-CTRSMKERNEL_RN = trsm_kernel_RN_sve.c
-CTRSMKERNEL_RT = trsm_kernel_RT_sve.c
-
-ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c
-ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c
-ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c
-ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c
-
-ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c
-ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c
-ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c
-ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c
-
-
-SAMAXKERNEL = amax.S
-DAMAXKERNEL = amax.S
-CAMAXKERNEL = zamax.S
-ZAMAXKERNEL = zamax.S
-
-SAXPYKERNEL = axpy.S
-DAXPYKERNEL = daxpy_thunderx2t99.S
-CAXPYKERNEL = zaxpy.S
-ZAXPYKERNEL = zaxpy.S
-
-SROTKERNEL = rot.S
-DROTKERNEL = rot.S
-CROTKERNEL = zrot.S
-ZROTKERNEL = zrot.S
-
-SSCALKERNEL = scal.S
-DSCALKERNEL = scal.S
-CSCALKERNEL = zscal.S
-ZSCALKERNEL = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-SASUMKERNEL = sasum_thunderx2t99.c
-DASUMKERNEL = dasum_thunderx2t99.c
-CASUMKERNEL = casum_thunderx2t99.c
-ZASUMKERNEL = zasum_thunderx2t99.c
-
-SCOPYKERNEL = copy_thunderx2t99.c
-DCOPYKERNEL = copy_thunderx2t99.c
-CCOPYKERNEL = copy_thunderx2t99.c
-ZCOPYKERNEL = copy_thunderx2t99.c
-
-SSWAPKERNEL = swap_thunderx2t99.S
-DSWAPKERNEL = swap_thunderx2t99.S
-CSWAPKERNEL = swap_thunderx2t99.S
-ZSWAPKERNEL = swap_thunderx2t99.S
-
-ISAMAXKERNEL = iamax_thunderx2t99.c
-IDAMAXKERNEL = iamax_thunderx2t99.c
-ICAMAXKERNEL = izamax_thunderx2t99.c
-IZAMAXKERNEL = izamax_thunderx2t99.c
-
-SNRM2KERNEL = scnrm2_thunderx2t99.c
-DNRM2KERNEL = dznrm2_thunderx2t99.c
-CNRM2KERNEL = scnrm2_thunderx2t99.c
-ZNRM2KERNEL = dznrm2_thunderx2t99.c
-
-DDOTKERNEL = dot.c
-SDOTKERNEL = dot.c
-CDOTKERNEL = zdot_thunderx2t99.c
-ZDOTKERNEL = zdot_thunderx2t99.c
-DSDOTKERNEL = dot.S
-
-DGEMM_BETA = dgemm_beta.S
-SGEMM_BETA = sgemm_beta.S
-
-SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
-STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
-
-SGEMMINCOPY = gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c
-SGEMMITCOPY = gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c
-SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
-SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
-
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
-STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
-STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
-STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
-
-SSYMMUCOPY_M = symm_ucopy_sve.c
-SSYMMLCOPY_M = symm_lcopy_sve.c
-
-DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
-DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPY = gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c
-DGEMMITCOPY = gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c
-DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
-DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
-DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
-DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
-
-DSYMMUCOPY_M = symm_ucopy_sve.c
-DSYMMLCOPY_M = symm_lcopy_sve.c
-
-CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
-CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
-CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
-CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
-
-CHEMMLTCOPY_M = zhemm_ltcopy_sve.c
-CHEMMUTCOPY_M = zhemm_utcopy_sve.c
-
-CSYMMUCOPY_M = zsymm_ucopy_sve.c
-CSYMMLCOPY_M = zsymm_lcopy_sve.c
-
-ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
-ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
-ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
-ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
-
-ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c
-ZHEMMUTCOPY_M = zhemm_utcopy_sve.c
-
-ZSYMMUCOPY_M = zsymm_ucopy_sve.c
-ZSYMMLCOPY_M = zsymm_lcopy_sve.c
+include $(KERNELDIR)/KERNEL.ARMV8SVE
diff --git a/kernel/arm64/KERNEL.CORTEXA55 b/kernel/arm64/KERNEL.CORTEXA55
index e2e006770..574e98b8c 100644
--- a/kernel/arm64/KERNEL.CORTEXA55
+++ b/kernel/arm64/KERNEL.CORTEXA55
@@ -1,196 +1 @@
-SAMINKERNEL = ../arm/amin.c
-DAMINKERNEL = ../arm/amin.c
-CAMINKERNEL = ../arm/zamin.c
-ZAMINKERNEL = ../arm/zamin.c
-
-SMAXKERNEL = ../arm/max.c
-DMAXKERNEL = ../arm/max.c
-
-SMINKERNEL = ../arm/min.c
-DMINKERNEL = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL = ../arm/imax.c
-IDMAXKERNEL = ../arm/imax.c
-
-ISMINKERNEL = ../arm/imin.c
-IDMINKERNEL = ../arm/imin.c
-
-STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-SAMAXKERNEL = amax.S
-DAMAXKERNEL = amax.S
-CAMAXKERNEL = zamax.S
-ZAMAXKERNEL = zamax.S
-
-SAXPYKERNEL = axpy.S
-DAXPYKERNEL = axpy.S
-CAXPYKERNEL = zaxpy.S
-ZAXPYKERNEL = zaxpy.S
-
-SROTKERNEL = rot.S
-DROTKERNEL = rot.S
-CROTKERNEL = zrot.S
-ZROTKERNEL = zrot.S
-
-SSCALKERNEL = scal.S
-DSCALKERNEL = scal.S
-CSCALKERNEL = zscal.S
-ZSCALKERNEL = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-
-SASUMKERNEL = asum.S
-DASUMKERNEL = asum.S
-CASUMKERNEL = casum.S
-ZASUMKERNEL = zasum.S
-
-SCOPYKERNEL = copy.S
-DCOPYKERNEL = copy.S
-CCOPYKERNEL = copy.S
-ZCOPYKERNEL = copy.S
-
-SSWAPKERNEL = swap.S
-DSWAPKERNEL = swap.S
-CSWAPKERNEL = swap.S
-ZSWAPKERNEL = swap.S
-
-ISAMAXKERNEL = iamax.S
-IDAMAXKERNEL = iamax.S
-ICAMAXKERNEL = izamax.S
-IZAMAXKERNEL = izamax.S
-
-SNRM2KERNEL = nrm2.S
-DNRM2KERNEL = nrm2.S
-CNRM2KERNEL = znrm2.S
-ZNRM2KERNEL = znrm2.S
-
-ifneq ($(C_COMPILER), PGI)
-SDOTKERNEL = ../generic/dot.c
-else
-SDOTKERNEL = dot.S
-endif
-DDOTKERNEL = dot.S
-ifneq ($(C_COMPILER), PGI)
-CDOTKERNEL = zdot.S
-ZDOTKERNEL = zdot.S
-else
-CDOTKERNEL = ../arm/zdot.c
-ZDOTKERNEL = ../arm/zdot.c
-endif
-DSDOTKERNEL = dot.S
-
-DGEMM_BETA = dgemm_beta.S
-SGEMM_BETA = sgemm_beta.S
-
-ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
-SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
-STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
-else
-SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-endif
-ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
-ifeq ($(SGEMM_UNROLL_M), 16)
-SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
-endif
-ifeq ($(SGEMM_UNROLL_M), 4)
-SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
-endif
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
-SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
-SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c
-DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
-
-ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
-
-ifeq ($(DGEMM_UNROLL_M), 8)
-DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
-DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
-else
-DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
-DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
-endif
-
-DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-ifeq ($(DGEMM_UNROLL_N), 4)
-DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
-else
-DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
-DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
-endif
-
-DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c
-CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
-ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
-CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
-CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
-CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
-CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
-CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c
-ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
-ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
-ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
-ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
-ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+include $(KERNELDIR)/KERNEL.CORTEXA53
-IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = daxpy_thunderx2t99.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) -SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c -SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c -SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S - -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) - -ifeq ($(DGEMM_UNROLL_M), 8) -DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S -DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S -else -DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c -DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c -endif - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S -else -DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c -endif - -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S 
-ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -SASUMKERNEL = sasum_thunderx2t99.c -DASUMKERNEL = dasum_thunderx2t99.c -CASUMKERNEL = casum_thunderx2t99.c -ZASUMKERNEL = zasum_thunderx2t99.c - -SCOPYKERNEL = copy_thunderx2t99.c -DCOPYKERNEL = copy_thunderx2t99.c -CCOPYKERNEL = copy_thunderx2t99.c -ZCOPYKERNEL = copy_thunderx2t99.c - -SSWAPKERNEL = swap_thunderx2t99.S -DSWAPKERNEL = swap_thunderx2t99.S -CSWAPKERNEL = swap_thunderx2t99.S -ZSWAPKERNEL = swap_thunderx2t99.S - -ISAMAXKERNEL = iamax_thunderx2t99.c -IDAMAXKERNEL = iamax_thunderx2t99.c -ICAMAXKERNEL = izamax_thunderx2t99.c -IZAMAXKERNEL = izamax_thunderx2t99.c - -SNRM2KERNEL = scnrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c -#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c - - -DDOTKERNEL = dot.c -SDOTKERNEL = dot.c -CDOTKERNEL = zdot_thunderx2t99.c -ZDOTKERNEL = zdot_thunderx2t99.c -DSDOTKERNEL = dot.S - -ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) -DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S -endif - -ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) -SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S -endif - -ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) -CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S -endif - -ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) -ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S -endif +include $(KERNELDIR)/KERNEL.THUNDERX2T99 diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c index 9c057551e..16f4cd537 100644 --- a/kernel/arm64/dot_kernel_sve.c +++ b/kernel/arm64/dot_kernel_sve.c @@ -1,4 +1,5 @@ /*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project Copyright (c) 2022, Arm Ltd All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,37 +31,84 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
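+/* Structure of the inline-assembly kernel below: DOT_KERNEL expands to a
+   two-vector main loop ("vector_2x", accumulating into z1 and z0 from two
+   pointers one vector length apart), a single-vector loop ("vector_1x")
+   for any remaining full vectors, and a whilelo-predicated tail, with
+   faddv horizontal sums of both accumulators combined into the result. */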
#include #ifdef DOUBLE -#define SVE_TYPE svfloat64_t -#define SVE_ZERO svdup_f64(0.0) -#define SVE_WHILELT svwhilelt_b64 -#define SVE_ALL svptrue_b64() -#define SVE_WIDTH svcntd() +#define DTYPE "d" +#define WIDTH "d" +#define SHIFT "3" #else -#define SVE_TYPE svfloat32_t -#define SVE_ZERO svdup_f32(0.0) -#define SVE_WHILELT svwhilelt_b32 -#define SVE_ALL svptrue_b32() -#define SVE_WIDTH svcntw() +#define DTYPE "s" +#define WIDTH "w" +#define SHIFT "2" #endif -static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) { - SVE_TYPE acc_a = SVE_ZERO; - SVE_TYPE acc_b = SVE_ZERO; +#define COUNT \ +" cnt"WIDTH" x9 \n" +#define SETUP_TRUE \ +" ptrue p0."DTYPE" \n" +#define OFFSET_INPUTS \ +" add x12, %[X_], x9, lsl #"SHIFT" \n" \ +" add x13, %[Y_], x9, lsl #"SHIFT" \n" +#define TAIL_WHILE \ +" whilelo p1."DTYPE", x8, x0 \n" +#define UPDATE(pg, x,y,out) \ +" ld1"WIDTH" { z2."DTYPE" }, "pg"/z, ["x", x8, lsl #"SHIFT"] \n" \ +" ld1"WIDTH" { z3."DTYPE" }, "pg"/z, ["y", x8, lsl #"SHIFT"] \n" \ +" fmla "out"."DTYPE", "pg"/m, z2."DTYPE", z3."DTYPE" \n" +#define SUM_VECTOR(v) \ +" faddv "DTYPE""v", p0, z"v"."DTYPE" \n" +#define RET \ +" fadd %"DTYPE"[RET_], "DTYPE"1, "DTYPE"0 \n" - BLASLONG sve_width = SVE_WIDTH; +#define DOT_KERNEL \ + COUNT \ +" mov z1.d, #0 \n" \ +" mov z0.d, #0 \n" \ +" mov x8, #0 \n" \ +" movi d1, #0x0 \n" \ + SETUP_TRUE \ +" neg x10, x9, lsl #1 \n" \ +" ands x11, x10, x0 \n" \ +" b.eq 2f // skip_2x \n" \ + OFFSET_INPUTS \ +"1: // vector_2x \n" \ + UPDATE("p0", "%[X_]", "%[Y_]", "z1") \ + UPDATE("p0", "x12", "x13", "z0") \ +" sub x8, x8, x10 \n" \ +" cmp x8, x11 \n" \ +" b.lo 1b // vector_2x \n" \ + SUM_VECTOR("1") \ +"2: // skip_2x \n" \ +" neg x10, x9 \n" \ +" and x10, x10, x0 \n" \ +" cmp x8, x10 \n" \ +" b.hs 4f // tail \n" \ +"3: // vector_1x \n" \ + UPDATE("p0", "%[X_]", "%[Y_]", "z0") \ +" add x8, x8, x9 \n" \ +" cmp x8, x10 \n" \ +" b.lo 3b // vector_1x \n" \ +"4: // tail \n" \ +" cmp x10, x0 \n" \ +" b.eq 5f // end \n" \ + TAIL_WHILE \ + UPDATE("p1", "%[X_]", "%[Y_]", "z0") \ +"5: // end \n" \ + SUM_VECTOR("0") \ + RET - for (BLASLONG i = 0; i < n; i += sve_width * 2) { - svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n); - svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n); +static +FLOAT +dot_kernel_sve(BLASLONG n, FLOAT* x, FLOAT* y) +{ + FLOAT ret; - SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); - SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); - SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]); - SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]); + asm(DOT_KERNEL + : + [RET_] "=&w" (ret) + : + [N_] "r" (n), + [X_] "r" (x), + [Y_] "r" (y) + :); - acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a); - acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b); - } - - return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b); + return ret; } diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 new file mode 100644 index 000000000..c365e9a75 --- /dev/null +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -0,0 +1,110 @@ +ifndef NO_LSX + +SDOTKERNEL = dot_lsx.S +DSDOTKERNEL = dot_lsx.S +DDOTKERNEL = dot_lsx.S +CDOTKERNEL = cdot_lsx.S +ZDOTKERNEL = cdot_lsx.S + +SSCALKERNEL = scal_lsx.S +DSCALKERNEL = scal_lsx.S +CSCALKERNEL = cscal_lsx.S +ZSCALKERNEL = cscal_lsx.S + +SAMAXKERNEL = amax_lsx.S +DAMAXKERNEL = amax_lsx.S +CAMAXKERNEL = camax_lsx.S + +SAMINKERNEL = amin_lsx.S +DAMINKERNEL = amin_lsx.S +CAMINKERNEL = camin_lsx.S + +SMAXKERNEL = max_lsx.S +DMAXKERNEL = max_lsx.S + +SMINKERNEL = min_lsx.S +DMINKERNEL = min_lsx.S + +ISMAXKERNEL = 
imax_lsx.S +IDMAXKERNEL = imax_lsx.S + +ISMINKERNEL = imin_lsx.S +IDMINKERNEL = imin_lsx.S + +ISAMAXKERNEL = iamax_lsx.S +IDAMAXKERNEL = iamax_lsx.S +ICAMAXKERNEL = icamax_lsx.S +IZAMAXKERNEL = icamax_lsx.S + +ISAMINKERNEL = iamin_lsx.S +IDAMINKERNEL = iamin_lsx.S +ICAMINKERNEL = icamin_lsx.S +IZAMINKERNEL = icamin_lsx.S + +SCOPYKERNEL = copy_lsx.S +DCOPYKERNEL = copy_lsx.S +CCOPYKERNEL = ccopy_lsx.S +ZCOPYKERNEL = ccopy_lsx.S + +SSWAPKERNEL = swap_lsx.S +DSWAPKERNEL = swap_lsx.S + +SAXPYKERNEL = axpy_lsx.S +DAXPYKERNEL = axpy_lsx.S +CAXPYKERNEL = caxpy_lsx.S +ZAXPYKERNEL = caxpy_lsx.S + +SAXPBYKERNEL = axpby_lsx.S +DAXPBYKERNEL = axpby_lsx.S + +SSUMKERNEL = sum_lsx.S +DSUMKERNEL = sum_lsx.S + +SASUMKERNEL = asum_lsx.S +DASUMKERNEL = asum_lsx.S +CASUMKERNEL = casum_lsx.S +ZASUMKERNEL = casum_lsx.S + +SROTKERNEL = rot_lsx.S +DROTKERNEL = rot_lsx.S +CROTKERNEL = crot_lsx.S +ZROTKERNEL = crot_lsx.S + +SNRM2KERNEL = snrm2_lsx.S +DNRM2KERNEL = dnrm2_lsx.S +CNRM2KERNEL = cnrm2_lsx.S +ZNRM2KERNEL = znrm2_lsx.S + +CSWAPKERNEL = cswap_lsx.S +ZSWAPKERNEL = cswap_lsx.S + +CSUMKERNEL = csum_lsx.S +ZSUMKERNEL = csum_lsx.S + +DGEMMKERNEL = dgemm_kernel_8x4.S +DGEMMINCOPY = dgemm_ncopy_8_lsx.S +DGEMMITCOPY = dgemm_tcopy_8_lsx.S +DGEMMONCOPY = dgemm_ncopy_4_lsx.S +DGEMMOTCOPY = dgemm_tcopy_4_lsx.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CGEMMKERNEL = cgemm_kernel_2x2_lsx.S +CGEMMONCOPY = cgemm_ncopy_2_lsx.S +CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 011e8b89e..68360faaf 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -1,4 +1,86 @@ ifndef NO_LASX + +SDOTKERNEL = dot_lasx.S +DSDOTKERNEL = dot_lasx.S +DDOTKERNEL = dot_lasx.S +CDOTKERNEL = cdot_lasx.S +ZDOTKERNEL = cdot_lasx.S + +SSCALKERNEL = scal_lasx.S +DSCALKERNEL = scal_lasx.S +CSCALKERNEL = cscal_lasx.S +ZSCALKERNEL = cscal_lasx.S + +SAMAXKERNEL = amax_lasx.S +DAMAXKERNEL = amax_lasx.S +CAMAXKERNEL = camax_lasx.S + +SAMINKERNEL = amin_lasx.S +DAMINKERNEL = amin_lasx.S +CAMINKERNEL = camin_lasx.S + +SMAXKERNEL = max_lsx.S +DMAXKERNEL = max_lsx.S + +SMINKERNEL = min_lsx.S +DMINKERNEL = min_lsx.S + +ISMAXKERNEL = imax_lasx.S +IDMAXKERNEL = imax_lasx.S + +ISMINKERNEL = imin_lasx.S +IDMINKERNEL = imin_lasx.S + +ISAMAXKERNEL = iamax_lasx.S +IDAMAXKERNEL = iamax_lasx.S +ICAMAXKERNEL = icamax_lasx.S +IZAMAXKERNEL = icamax_lasx.S + +ISAMINKERNEL = iamin_lasx.S +IDAMINKERNEL = iamin_lasx.S +ICAMINKERNEL = icamin_lasx.S +IZAMINKERNEL = icamin_lasx.S + +SCOPYKERNEL = copy_lasx.S +DCOPYKERNEL = copy_lasx.S +CCOPYKERNEL = ccopy_lasx.S +ZCOPYKERNEL = ccopy_lasx.S + +SSWAPKERNEL = swap_lasx.S +DSWAPKERNEL = swap_lasx.S + +SAXPYKERNEL = axpy_lasx.S +DAXPYKERNEL = axpy_lasx.S +CAXPYKERNEL = caxpy_lasx.S +ZAXPYKERNEL = caxpy_lasx.S + +SAXPBYKERNEL = axpby_lasx.S +DAXPBYKERNEL = axpby_lasx.S + 
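+# Note: kernels named *_lasx.S use the 256-bit LASX vector extension; the
+# max/min kernels above reuse the 128-bit LSX implementations (max_lsx.S,
+# min_lsx.S).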
+SSUMKERNEL = sum_lasx.S +DSUMKERNEL = sum_lasx.S + +SASUMKERNEL = asum_lasx.S +DASUMKERNEL = asum_lasx.S +CASUMKERNEL = casum_lasx.S +ZASUMKERNEL = casum_lasx.S + +SROTKERNEL = rot_lasx.S +DROTKERNEL = rot_lasx.S +CROTKERNEL = crot_lasx.S +ZROTKERNEL = crot_lasx.S + +SNRM2KERNEL = snrm2_lasx.S +DNRM2KERNEL = dnrm2_lasx.S +CNRM2KERNEL = cnrm2_lasx.S +ZNRM2KERNEL = znrm2_lasx.S + +CSWAPKERNEL = cswap_lasx.S +ZSWAPKERNEL = cswap_lasx.S + +CSUMKERNEL = csum_lasx.S +ZSUMKERNEL = csum_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S @@ -25,13 +107,35 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMVNKERNEL = sgemv_n_8_lasx.S SGEMVTKERNEL = sgemv_t_8_lasx.S +CGEMMKERNEL = cgemm_kernel_2x2_lsx.S +CGEMMONCOPY = cgemm_ncopy_2_lsx.S +CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZGEMMKERNEL = zgemm_kernel_2x2_lasx.S +ZGEMMONCOPY = zgemm_ncopy_2_lasx.S +ZGEMMOTCOPY = zgemm_tcopy_2_lasx.S +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + DTRSMKERNEL_LN = dtrsm_kernel_LN_16x4_lasx.S DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_lasx.S DTRSMKERNEL_RN = dtrsm_kernel_RN_16x4_lasx.S DTRSMKERNEL_RT = dtrsm_kernel_RT_16x4_lasx.S -endif STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif diff --git a/kernel/loongarch64/amax_lasx.S b/kernel/loongarch64/amax_lasx.S new file mode 100644 index 000000000..e964d4ddb --- /dev/null +++ b/kernel/loongarch64/amax_lasx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + XVFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM0, VM1 + XVFMAXA VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAXA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMAXA VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAXA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + xvfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d 
VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAXA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMAXA VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAXA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amax_lsx.S b/kernel/loongarch64/amax_lsx.S new file mode 100644 index 000000000..fb3b77a0e --- /dev/null +++ b/kernel/loongarch64/amax_lsx.S @@ -0,0 +1,231 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
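+
+Implementation note: AMAX returns max_i |x_i|. The vector loops keep the
+element of largest magnitude with VFMAXA (maximum by absolute value) and
+apply FABS to the scalar result only at the end; each main-loop iteration
+consumes 8 elements (two 128-bit LSX registers of floats, or four of
+doubles).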
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + VFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM0, VM1 + VFMAXA VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMAXA VM1, VX0, VX1 + VFMAXA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAXA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAXA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM1, VM1, VM2 + vfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmaxa.s VM1, VX0, VX1 + vfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAXA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAXA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin_lasx.S 
b/kernel/loongarch64/amin_lasx.S new file mode 100644 index 000000000..0a4359002 --- /dev/null +++ b/kernel/loongarch64/amin_lasx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + XVFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM0, VM1 + XVFMINA VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMINA VM1, VX0, VX1 + XVFMINA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMINA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMINA VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMINA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge 
$r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + xvfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMINA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMINA VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMINA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S new file mode 100644 index 000000000..644caf43c --- /dev/null +++ b/kernel/loongarch64/amin_lsx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + VFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM0, VM1 + VFMINA VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMINA VM1, VX0, VX1 + VFMINA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMINA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMINA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + SUB $f0, $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM1, VM1, VM2 + vfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmaxa.s VM1, VX0, VX1 + vfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d 
VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMINA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMINA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/asum_lasx.S b/kernel/loongarch64/asum_lasx.S new file mode 100644 index 000000000..9a2c031f3 --- /dev/null +++ b/kernel/loongarch64/asum_lasx.S @@ -0,0 +1,257 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
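+
+Implementation note: this kernel computes asum(x) = sum_i |x_i|. Negative
+elements are flipped by multiplying with a broadcast -1.0 (neg1) and
+selecting the product via xvfcmp.clt/xvbitsel; the per-lane partial sums
+kept in res1 are reduced with xvpickve/xvfadd before the scalar tail loop.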
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define VT0 $xr23 +#define VT1 $xr22 +#define res1 $xr16 +#define res2 $xr17 +#define res0 $xr18 +#define neg1 $xr19 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + xvreplgr2vr.d neg1, t1 + xvffint.d.l neg1, neg1 +#else + li.w t1, -1 + xvreplgr2vr.w neg1, t1 + xvffint.s.w neg1, neg1 +#endif + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w 
VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/asum_lsx.S b/kernel/loongarch64/asum_lsx.S new file mode 100644 index 000000000..512b01404 --- /dev/null +++ b/kernel/loongarch64/asum_lsx.S @@ -0,0 +1,258 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define VT0 $vr23 +#define VT1 $vr22 +#define res1 $vr16 +#define res2 $vr17 +#define res0 $vr18 +#define neg1 $vr19 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + vreplgr2vr.d neg1, t1 + vffint.d.l neg1, neg1 +#else + li.w t1, -1 + vreplgr2vr.w neg1, t1 + vffint.s.w neg1, neg1 +#endif + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + 
vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/axpby_lasx.S b/kernel/loongarch64/axpby_lasx.S new file mode 100644 index 000000000..f1d99cd3b --- /dev/null +++ b/kernel/loongarch64/axpby_lasx.S @@ -0,0 +1,1050 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
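+
+Implementation note: AXPBY computes y := alpha*x + beta*y. The kernel
+branches once up front on the four combinations of unit/non-unit INCX and
+INCY, and within each on the four special cases alpha == 0 / beta == 0, so
+the zero-coefficient paths skip the corresponding loads and multiplies.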
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r5 +#define INCX $r6 +#define BETA $f1 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXA $xr23 +#define VXB $xr9 +#define VXZ $xr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + MTG t1, ALPHA + MTG t2, BETA + MTG t3, a1 +#ifdef DOUBLE + xvreplgr2vr.d VXA, t1 + xvreplgr2vr.d VXB, t2 + xvreplgr2vr.d VXZ, t3 +#else + xvreplgr2vr.w VXA, t1 + xvreplgr2vr.w VXB, t2 + xvreplgr2vr.w VXZ, t3 +#endif + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 + b .L111 // ALPHA!=0 BETA!=0 + .align 3 + +.L110: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L114 // ALPHA==0 BETA==0 + b .L113 // ALPHA==0 BETA!=0 + .align 3 + +.L111: // ALPHA!=0 BETA!=0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmul.d VX0, VX0, VXA + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX2, VX2, VXB, VX0 + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvfmadd.s VX2, VX2, VXB, VX0 + xvst VX2, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // ALPHA!=0 BETA==0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvfmul.d VX0, VX0, VXA + xvfmul.d VX1, VX1, VXA + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE +#else + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // ALPHA==0 BETA!=0 + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX3, Y, 4 * SIZE + xvfmul.d VX2, VX2, VXB + xvfmul.d VX3, VX3, VXB + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE +#else + xvfmul.s VX2, VX2, VXB + xvst VX2, Y, 0 * SIZE +#endif + addi.d I, I, -1 + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // ALPHA==0 BETA==0 + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L120 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 + b .L121 // ALPHA!=0 BETA!=0 + .align 3 + +.L120: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L124 // ALPHA==0 BETA==0 + b .L123 // ALPHA==0 BETA!=0 + .align 3 + +.L121: // ALPHA!=0 BETA!=0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + 
xvfmul.d VX0, VX0, VXA + xvld VX1, X, 4 * SIZE + xvfmadd.d VX2, VX2, VXB, VX0 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // ALPHA!=0 BETA==0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvfmul.d VX0, VX0, VXA + xvfmul.d VX1, VX1, VXA + xvstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 +#else + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // ALPHA==0 BETA!=0 +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX2, VX2, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, 
INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX3, VX3, VXB + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // ALPHA==0 BETA==0 +#ifdef DOUBLE + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L210 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 + b .L211 // ALPHA!=0 BETA!=0 + .align 3 + +.L210: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L214 // ALPHA==0 BETA==0 + b .L213 // ALPHA==0 BETA!=0 + .align 3 + +.L211: // ALPHA!=0 BETA!=0 + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VX0, VXA, VX0 + xvfmadd.d VX2, VX2, VXB, VX0 + xvld VX3, Y, 4 * SIZE + xvst VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvst VX3, Y, 4 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + 
xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // ALPHA!=0 BETA==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VX0, VXA, VX0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvst VX0, Y, 0 * SIZE + xvfmul.d VX1, VX1, VXA + addi.d I, I, -1 + xvst VX1, Y, 4 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // ALPHA==0 BETA!=0 + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX3, Y, 4 * SIZE + xvfmul.d VX2, VX2, VXB + xvfmul.d VX3, VX3, VXB + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE +#else + xvfmul.s VX2, VX2, VXB + xvst VX2, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // ALPHA==0 BETA==0 + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 + b .L221 // ALPHA!=0 BETA!=0 + .align 3 + +.L220: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L224 // ALPHA==0 BETA==0 + b .L223 // ALPHA==0 BETA!=0 + .align 3 + +.L221: // ALPHA!=0 BETA!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, VX0, VXA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvfmadd.d VX2, VX2, VXB, VX0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, 
t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // ALPHA!=0 BETA==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VX0, VX0, VXA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX1, VX1, VXA + addi.d I, I, -1 + xvstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + 
xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // ALPHA==0 BETA!=0 +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX2, VX2, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX3, VX3, VXB + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // ALPHA==0 BETA==0 +#ifdef DOUBLE + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY 
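+ // ALPHA==0 && BETA==0: keep scattering the zero vector VXZ, one element per INCY step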
+ xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE + addi.d I, I, -1 + MUL $f12, $f12, ALPHA + MADD $f13, $f13, BETA, $f12 + ST $f13, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/axpby_lsx.S b/kernel/loongarch64/axpby_lsx.S new file mode 100644 index 000000000..45154c262 --- /dev/null +++ b/kernel/loongarch64/axpby_lsx.S @@ -0,0 +1,1148 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
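The file added below, kernel/loongarch64/axpby_lsx.S, is the 128-bit LSX counterpart of the 256-bit LASX AXPBY kernel above. Both compute y := alpha*x + beta*y over strided vectors, and both split the work into four specialized loop families depending on whether alpha and beta are zero, so that the beta==0 paths never read Y and the alpha==0, beta==0 paths reduce to storing zeros. A minimal scalar C sketch of the contract the vector paths are expected to match (axpby_ref and its signature are illustrative, assuming non-negative strides; this is not the OpenBLAS kernel interface):

    #include <stddef.h>

    /* Scalar model of ?AXPBY: y[i*incy] = alpha*x[i*incx] + beta*y[i*incy].
       The case split mirrors the kernel's .L111-.L114 loop families. */
    static void axpby_ref(size_t n, double alpha, const double *x, size_t incx,
                          double beta, double *y, size_t incy)
    {
        for (size_t i = 0; i < n; i++, x += incx, y += incy) {
            if (alpha == 0.0)
                *y = (beta == 0.0) ? 0.0 : beta * *y;     /* .L114 / .L113 */
            else if (beta == 0.0)
                *y = alpha * *x;                          /* .L112 */
            else
                *y = alpha * *x + beta * *y;              /* .L111 */
        }
    }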
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r5 +#define INCX $r6 +#define BETA $f1 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXA $vr23 +#define VXB $vr9 +#define VXZ $vr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + MTG t1, ALPHA + MTG t2, BETA + MTG t3, a1 +#ifdef DOUBLE + vreplgr2vr.d VXA, t1 + vreplgr2vr.d VXB, t2 + vreplgr2vr.d VXZ, t3 +#else + vreplgr2vr.w VXA, t1 + vreplgr2vr.w VXB, t2 + vreplgr2vr.w VXZ, t3 +#endif + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 + b .L111 // ALPHA!=0 BETA!=0 + .align 3 + +.L110: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L114 // ALPHA==0 BETA==0 + b .L113 // ALPHA==0 BETA!=0 + .align 3 + +.L111: // ALPHA!=0 BETA!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vfmadd.d VX2, VX2, VXB, VX0 + vfmadd.d VX3, VX3, VXB, VX1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vfmadd.d VX2, VX2, VXB, VX0 + vfmadd.d VX3, VX3, VXB, VX1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vfmadd.s VX2, VX2, VXB, VX0 + vfmadd.s VX3, VX3, VXB, VX1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // ALPHA!=0 BETA==0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfmul.d VX2, VX2, VXA + vfmul.d VX3, VX3, VXA + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // ALPHA==0 BETA!=0 +#ifdef DOUBLE + vld VX0, Y, 0 * SIZE + vld VX1, Y, 2 * SIZE + vfmul.d VX0, VX0, VXB + vfmul.d VX1, VX1, VXB + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VX2, VX2, VXB + vfmul.d VX3, VX3, VXB + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // ALPHA==0 BETA==0 + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, Y, 2 * SIZE + vst VXZ, Y, 4 * SIZE + vst VXZ, 
Y, 6 * SIZE +#else + vst VXZ, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L120 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 + b .L121 // ALPHA!=0 BETA!=0 + .align 3 + +.L120: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L124 // ALPHA==0 BETA==0 + b .L123 // ALPHA==0 BETA!=0 + .align 3 + +.L121: // ALPHA!=0 BETA!=0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vfmul.d VX0, VX0, VXA + vld VX1, X, 2 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + vld VX0, X, 4 * SIZE + vfmadd.d VX3, VX3, VXB, VX1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX0, VX0, VXA + vld VX1, X, 6 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + vld VX1, X, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // ALPHA!=0 BETA==0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 +#else + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, 
INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // ALPHA==0 BETA!=0 +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // ALPHA==0 BETA==0 +#ifdef DOUBLE + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L210 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 + b .L211 // ALPHA!=0 BETA!=0 + .align 3 + +.L210: + CMPEQ $fcc0, BETA, 
a1 + bcnez $fcc0, .L214 // ALPHA==0 BETA==0 + b .L213 // ALPHA==0 BETA!=0 + .align 3 + +.L211: // ALPHA!=0 BETA!=0 + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VX0, VXA, VX0 + vld VX3, Y, 2 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX2, Y, 0 * SIZE + vfmul.d VX1, VXA, VX1 + vld VX2, Y, 4 * SIZE + vfmadd.d VX3, VX3, VXB, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vst VX3, Y, 2 * SIZE + vfmul.d VX0, VX0, VXA + vld VX3, Y, 6 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX2, Y, 4 * SIZE + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vst VX3, Y, 6 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX2, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // ALPHA!=0 BETA==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VX0, VXA, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 * SIZE + vfmul.d VX1, VXA, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vst VX1, Y, 2 * SIZE + vfmul.d VX0, VX0, VXA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 4 * SIZE + vfmul.d VX1, VX1, VXA + addi.d I, I, -1 + vst VX1, Y, 6 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX0, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vst VX1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // ALPHA==0 BETA!=0 + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX3, Y, 2 * SIZE + vfmul.d VX2, VX2, VXB + vfmul.d VX3, VX3, VXB + vst 
VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VX2, VX2, VXB + vfmul.d VX3, VX3, VXB + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // ALPHA==0 BETA==0 + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, Y, 2 * SIZE + vst VXZ, Y, 4 * SIZE + vst VXZ, Y, 6 * SIZE +#else + vst VXZ, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 + b .L221 // ALPHA!=0 BETA!=0 + .align 3 + +.L220: + CMPEQ $fcc0, BETA, a1 + bcnez $fcc0, .L224 // ALPHA==0 BETA==0 + b .L223 // ALPHA==0 BETA!=0 + .align 3 + +.L221: // ALPHA!=0 BETA!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VX0, VXA + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VX0, VXA + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vfmadd.s VX2, VX2, VXB, VX0 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + 
add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vfmadd.s VX3, VX3, VXB, VX1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // ALPHA!=0 BETA==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VX0, VX0, VXA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX0, VX0, VXA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + addi.d I, I, -1 + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // ALPHA==0 BETA!=0 +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, 
Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // ALPHA==0 BETA==0 +#ifdef DOUBLE + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE + addi.d I, I, -1 + MUL $f12, $f12, ALPHA + MADD $f13, $f13, BETA, $f12 + ST $f13, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/axpy_lasx.S b/kernel/loongarch64/axpy_lasx.S new file mode 100644 index 000000000..707fd09b5 --- /dev/null +++ b/kernel/loongarch64/axpy_lasx.S @@ -0,0 +1,529 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
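The new file in this hunk, kernel/loongarch64/axpy_lasx.S, implements ?AXPY, y := alpha*x + y, with 256-bit LASX vectors. Because there is no beta term, the prologue can return immediately when alpha == 0 (the CMPEQ/bcnez branch to .L999), and the contiguous path takes a multiply-free vector-add loop when alpha == 1 (the xvfadd loop at .L111). A scalar C sketch of the semantics, with an illustrative name and signature rather than the real kernel interface:

    #include <stddef.h>

    /* Scalar model of ?AXPY: y[i*incy] += alpha * x[i*incx].
       alpha == 0 is a no-op, hence the kernel's early exit. */
    static void axpy_ref(size_t n, float alpha, const float *x, size_t incx,
                         float *y, size_t incy)
    {
        if (alpha == 0.0f)
            return;
        for (size_t i = 0; i < n; i++)
            y[i * incy] += alpha * x[i * incx];
    }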
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXA $xr23 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + movgr2fr.d a2, TEMP + FFINT a2, a2 + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L999 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + MTG t1, ALPHA +#ifdef DOUBLE + xvreplgr2vr.d VXA, t1 +#else + xvreplgr2vr.w VXA, t1 +#endif + + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L113 + CMPEQ $fcc0, ALPHA, a2 + bceqz $fcc0, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfadd.d VX2, VX0, VX2 + xvfadd.d VX3, VX1, VX3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfadd.s VX2, VX0, VX2 + xvst VX2, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L113 + .align 3 + +.L112: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmadd.d VX2, VX0, VXA, VX2 + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfmadd.s VX2, VX0, VXA, VX2 + xvst VX2, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L112 + .align 3 + +.L113: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L114: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L114 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + 
xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX2, VX0, VXA, VX2 + xvld VX1, X, 4 * SIZE + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmadd.d VX2, VX0, VXA, VX2 + xvld VX3, Y, 4 * SIZE + xvst VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 
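+ // eight gathered X singles folded into Y with one fused multiply-add; store the results contiguously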
+ xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX2, VX0, VXA, VX2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 
+ .align 3 + +.L224: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/axpy_lsx.S b/kernel/loongarch64/axpy_lsx.S new file mode 100644 index 000000000..0d74e2bce --- /dev/null +++ b/kernel/loongarch64/axpy_lsx.S @@ -0,0 +1,573 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
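The final file of the patch, kernel/loongarch64/axpy_lsx.S, repeats the AXPY kernel with 128-bit LSX registers, so each 8-element block needs twice as many vector operations as the LASX version (two doubles or four floats per register instead of four and eight). All four kernels in this patch share the same blocking: srai.d I, N, 3 sets the main-loop trip count to N/8, and andi I, N, 7 drives a scalar tail loop for the remainder. A C sketch of that decomposition (axpy_blocked is an illustrative stand-in, unit strides assumed):

    #include <stddef.h>

    /* Blocking shape shared by the kernels: an 8-element vector body
       (srai.d I, N, 3) followed by a scalar tail (andi I, N, 7).
       The inner k-loop stands in for the SIMD block. */
    static void axpy_blocked(size_t n, double alpha, const double *x, double *y)
    {
        size_t i = 0;
        for (size_t blocks = n >> 3; blocks != 0; blocks--, i += 8)
            for (size_t k = 0; k < 8; k++)
                y[i + k] += alpha * x[i + k];
        for (size_t tail = n & 7; tail != 0; tail--, i++)
            y[i] += alpha * x[i];
    }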
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXA $vr23 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + movgr2fr.d a2, TEMP + FFINT a2, a2 + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L999 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + MTG t1, ALPHA +#ifdef DOUBLE + vreplgr2vr.d VXA, t1 +#else + vreplgr2vr.w VXA, t1 +#endif + + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L113 + CMPEQ $fcc0, ALPHA, a2 + bceqz $fcc0, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfadd.d VX2, VX0, VX2 + vfadd.d VX3, VX1, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vfadd.d VX2, VX0, VX2 + vfadd.d VX3, VX1, VX3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfadd.s VX2, VX0, VX2 + vfadd.s VX3, VX1, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L113 + .align 3 + +.L112: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfmadd.d VX2, VX0, VXA, VX2 + vfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + vfmadd.d VX2, VX0, VXA, VX2 + vfmadd.d VX3, VX1, VXA, VX3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX0, VXA, VX2 + vfmadd.s VX3, VX1, VXA, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 +#endif + blt $r0, I, .L112 + .align 3 + +.L113: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L114: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L114 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + vld VX1, X, 2 * SIZE + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmadd.d 
VX3, VX1, VXA, VX3 + vld VX0, X, 4 * SIZE + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + vld VX1, X, 6 * SIZE + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + vld VX1, X, 4 * SIZE + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE +#endif + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmadd.d VX2, VX0, VXA, VX2 + vld VX3, Y, 2 * SIZE + vst VX2, Y, 0 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmadd.d VX3, VX1, VXA, VX3 + vld VX2, Y, 4 * SIZE + vst VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmadd.d VX2, VX0, VXA, VX2 + vld VX3, Y, 6 * SIZE + vst VX2, Y, 4 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX3, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmadd.s VX2, VX0, VXA, VX2 + vld VX3, Y, 4 * SIZE + vst VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w 
VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + 
add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camax_lasx.S b/kernel/loongarch64/camax_lasx.S new file mode 100644 index 000000000..7013430cb --- /dev/null +++ b/kernel/loongarch64/camax_lasx.S @@ -0,0 +1,194 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
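[Annotation] The camax kernels introduced next compute the complex "absolute maximum" in the BLAS sense: an element's magnitude is |Re| + |Im|, not the Euclidean modulus. The vector loop at .L10 splits real and imaginary lanes with xvpickev/xvpickod and folds across lanes at .L11. A hedged C reference for the single-precision case (camax_ref is a hypothetical name; incx counts complex elements, matching the ZBASE_SHIFT scaling in the prologue):

#include <math.h>

/* max over i of |Re(x_i)| + |Im(x_i)|, with the running maximum seeded
   to zero as the xvxor.v of VM0 does. */
static float camax_ref(long n, const float *x, long incx)
{
    float m = 0.0f;
    for (long i = 0; i < n; i++) {
        const float *p = x + 2 * i * incx;   /* interleaved re, im */
        float a = fabsf(p[0]) + fabsf(p[1]);
        if (a > m) m = a;
    }
    return m;
}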
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VT0 $xr13 +#define VT1 $xr14 +#define res0 $xr18 +#define neg1 $xr19 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 + + PROLOGUE + xvxor.v VM0, VM0, VM0 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + xvreplgr2vr.w neg1, I + xvffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, neg1, x1 + xvfmul.s x4, neg1, x2 + xvfcmp.clt.s VT0, x1, res0 + xvfcmp.clt.s VT1, x2, res0 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VT1 + addi.d X, X, 16 * SIZE + xvfadd.s VM1, x1, x2 + xvfmax.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmax.s VM1, x1, x2 + xvfmax.s VM0, x3, x4 + xvfmax.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmax.s s1, s1, s2 + fmax.s s3, s3, s4 + fmax.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + fmax.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camax_lsx.S b/kernel/loongarch64/camax_lsx.S new file mode 100644 index 000000000..2e55629de --- /dev/null +++ b/kernel/loongarch64/camax_lsx.S @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VT0 $vr13 +#define VT1 $vr14 +#define res0 $vr18 +#define neg1 $vr19 +#define VX0 $vr20 +#define VX1 $vr21 +#define VM0 $vr22 +#define VM1 $vr23 + + PROLOGUE + vxor.v VM0, VM0, VM0 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + vreplgr2vr.w neg1, I + vffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + vld VX0, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vld VX1, X, 12 * SIZE + vfadd.s VM1, x1, x2 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + addi.d X, X, 16 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vfadd.s x1, x1, x2 + vfmax.s VM1, x1, VM1 + vfmax.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmax.s VM1, x1, x2 + vfmax.s VM0, x3, x4 + vfmax.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, 
t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmax.s s1, s1, s2 + fmax.s s3, s3, s4 + fmax.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + addi.d I, I, -1 + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s a0, a0, a1 + add.d X, X, INCX + fmax.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camin_lasx.S b/kernel/loongarch64/camin_lasx.S new file mode 100644 index 000000000..d7931d30a --- /dev/null +++ b/kernel/loongarch64/camin_lasx.S @@ -0,0 +1,199 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
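[Annotation] camin_lasx.S, opening here, is the minimum-magnitude counterpart: the same |Re| + |Im| metric, but the running value is seeded from the first element in the prologue (the two fld.s/fabs.s plus fadd.s before the loop) rather than from zero, since a zero seed would be absorbing for a minimum. A C sketch under the same conventions (camin_ref is hypothetical):

#include <math.h>

/* min over i of |Re(x_i)| + |Im(x_i)|; seeding from element 0 and then
   rescanning it, as the assembly does, is harmless for a minimum. */
static float camin_ref(long n, const float *x, long incx)
{
    if (n <= 0) return 0.0f;
    float m = fabsf(x[0]) + fabsf(x[1]);
    for (long i = 0; i < n; i++) {
        const float *p = x + 2 * i * incx;
        float a = fabsf(p[0]) + fabsf(p[1]);
        if (a < m) m = a;
    }
    return m;
}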
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define TEMP $r16 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define a0 $f20 +#define a1 $f21 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VT0 $xr13 +#define VT1 $xr14 +#define res0 $xr18 +#define neg1 $xr19 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 + + PROLOGUE + MTC s1, $r0 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s s1, a1, a0 + xvreplve0.w VM0, VM0 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + xvreplgr2vr.w neg1, I + xvffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, neg1, x1 + xvfmul.s x4, neg1, x2 + xvfcmp.clt.s VT0, x1, res0 + xvfcmp.clt.s VT1, x2, res0 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VT1 + addi.d X, X, 16 * SIZE + xvfadd.s VM1, x1, x2 + xvfmin.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmin.s VM1, x1, x2 + xvfmin.s VM0, x3, x4 + xvfmin.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmin.s s1, s1, s2 + fmin.s s3, s3, s4 + fmin.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + fmin.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camin_lsx.S b/kernel/loongarch64/camin_lsx.S new file mode 100644 index 000000000..e9ad6b04d --- /dev/null +++ b/kernel/loongarch64/camin_lsx.S @@ -0,0 +1,211 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
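[Annotation] camin_lsx.S below is the 128-bit LSX twin of the LASX kernel above: the same algorithm with 4 float lanes per vector instead of 8, so each unrolled iteration issues two load/compare/select rounds to still retire 8 complex elements. Its final cross-lane fold at .L11 extracts lanes with vreplvei.w and combines them pairwise with vfmin.s; a sketch of that reduction in C, assuming the 4 partial minima sit in an array (hmin4 is a hypothetical helper):

/* Pairwise horizontal min of 4 lanes, mirroring the vreplvei.w/vfmin.s
   sequence at .L11: min(min(v0,v1), min(v2,v3)). */
static float hmin4(const float v[4])
{
    float a = v[0] < v[1] ? v[0] : v[1];
    float b = v[2] < v[3] ? v[2] : v[3];
    return a < b ? a : b;
}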
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VT0 $vr13 +#define VT1 $vr14 +#define res0 $vr18 +#define neg1 $vr19 +#define VX0 $vr20 +#define VX1 $vr21 +#define VM0 $vr22 +#define VM1 $vr23 + + PROLOGUE + MTC s1, $r0 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s s1, a1, a0 + vreplvei.w VM0, VM0, 0 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + vreplgr2vr.w neg1, I + vffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + vld VX0, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vld VX1, X, 12 * SIZE + vfadd.s VM1, x1, x2 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + addi.d X, X, 16 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vfadd.s x1, x1, x2 + vfmin.s VM1, x1, VM1 + vfmin.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmin.s VM1, x1, x2 + vfmin.s VM0, x3, x4 + vfmin.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s 
t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmin.s s1, s1, s2 + fmin.s s3, s3, s4 + fmin.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + addi.d I, I, -1 + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s a0, a0, a1 + add.d X, X, INCX + fmin.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/casum_lasx.S b/kernel/loongarch64/casum_lasx.S new file mode 100644 index 000000000..caf0ff969 --- /dev/null +++ b/kernel/loongarch64/casum_lasx.S @@ -0,0 +1,329 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
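[Annotation] The casum kernels starting here accumulate the complex absolute sum, i.e. the sum over i of |Re(x_i)| + |Im(x_i)|. Absolute values are formed branchlessly (multiply by a vector of -1.0, compare against zero, bit-select), partial sums live in the vector accumulators res1/res2, and lanes are folded at .L12/.L22. A hedged scalar reference for the single-precision build (casum_ref is a hypothetical name):

#include <math.h>

/* Complex absolute sum; the scalar tail loops .L14/.L24 do exactly this,
   two floats per element. */
static float casum_ref(long n, const float *x, long incx)
{
    float s = 0.0f;
    for (long i = 0; i < n; i++) {
        const float *p = x + 2 * i * incx;   /* interleaved re, im */
        s += fabsf(p[0]) + fabsf(p[1]);
    }
    return s;
}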
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 +#define res3 $xr18 +#define res0 $xr19 +#define neg1 $xr20 +#define VT0 $xr21 +#define VT1 $xr22 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + xvreplgr2vr.d neg1, t1 + xvffint.d.l neg1, neg1 +#else + li.w t1, -1 + xvreplgr2vr.w neg1, t1 + xvffint.s.w neg1, neg1 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvfmul.d VX0, neg1, VX2 + xvfmul.d VX1, neg1, VX3 + xvfcmp.clt.d VT0, VX2, res0 + xvfcmp.clt.d VT1, VX3, res0 + xvbitsel.v VX2, VX2, VX0, VT0 + xvbitsel.v VX3, VX3, VX1, VT1 + xvfadd.d res2, VX2, VX3 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvfmul.s VX2, neg1, VX0 + xvfmul.s VX3, neg1, VX1 + xvfcmp.clt.s VT0, VX0, res0 + xvfcmp.clt.s VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, 
VX1 + xvfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvfmul.s VX2, neg1, VX0 + xvfmul.s VX3, neg1, VX1 + xvfcmp.clt.s VT0, VX0, res0 + xvfcmp.clt.s VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/casum_lsx.S b/kernel/loongarch64/casum_lsx.S new file mode 100644 index 000000000..4822f2080 --- /dev/null +++ b/kernel/loongarch64/casum_lsx.S @@ -0,0 +1,358 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 +#define res0 $vr19 +#define neg1 $vr20 +#define VT0 $vr21 +#define VT1 $vr22 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + vreplgr2vr.d neg1, t1 + vffint.d.l neg1, neg1 +#else + li.w t1, -1 + vreplgr2vr.w neg1, t1 + vffint.s.w neg1, neg1 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfmul.d VX0, neg1, VX2 + vfmul.d VX1, neg1, VX3 + vfcmp.clt.d VT0, VX2, res0 + vfcmp.clt.d VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vfmul.d VX0, neg1, VX2 + vfmul.d VX1, neg1, VX3 + vfcmp.clt.d VT0, VX2, res0 + vfcmp.clt.d VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + addi.d I, I, -1 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vld VX0, 
X, 8 * SIZE + vld VX1, X, 12 * SIZE + addi.d I, I, -1 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res3, VX1, VX0 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d X, X, 16 * SIZE + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w 
VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vfmul.s VX0, neg1, VX2 + vfmul.s VX1, neg1, VX3 + vfcmp.clt.s VT0, VX2, res0 + vfcmp.clt.s VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/caxpy_lasx.S b/kernel/loongarch64/caxpy_lasx.S new file mode 100644 index 000000000..2b970fe70 --- /dev/null +++ b/kernel/loongarch64/caxpy_lasx.S @@ -0,0 +1,707 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
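[Annotation] caxpy_lasx.S, opening here, performs the complex AXPY update. The kernel deinterleaves real and imaginary lanes with xvpickev/xvpickod, forms the product with two multiplies plus a fused multiply-subtract/multiply-add pair, and re-interleaves with xvilvl/xvilvh before the store; the CONJ build conjugates x, giving the caxpyc entry point. A C sketch of the semantics (caxpy_ref and the conj_x flag are illustrative, not the patch's interface):

#include <complex.h>

/* y[i] += alpha * x[i], or alpha * conj(x[i]) in the CONJ build.
   Expanded: Re += ar*xr - ai*xi, Im += ar*xi + ai*xr; the signs on the
   xi terms flip under conjugation, matching the xvfmsub/xvfmadd pairs. */
static void caxpy_ref(long n, float complex alpha,
                      const float complex *x, long incx,
                      float complex *y, long incy, int conj_x)
{
    for (long i = 0; i < n; i++) {
        float complex xi = x[i * incx];
        y[i * incy] += alpha * (conj_x ? conjf(xi) : xi);
    }
}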
+*****************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L10 + bcnez $fcc1, .L999 +.L10: + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, ALPHAR + MTG t2, ALPHAI +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + xvreplgr2vr.d VXAI, t2 + srai.d I, N, 2 +#else + xvreplgr2vr.w VXAR, t1 + xvreplgr2vr.w VXAI, t2 + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d 
Y, Y, INCY + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + xvld VX1, X, 8 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 +#else + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + addi.d X, X, 16 * SIZE +#endif + blt $r0, I, .L121 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + 
xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvld VX3, Y, 8 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d 
X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif + addi.d I, I, -1 +#ifdef DOUBLE + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 +#else + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 +#if !defined(CONJ) + MUL s1, 
ALPHAI, a2
+    MUL s2, ALPHAI, a1
+    MSUB s3, ALPHAR, a1, s1
+    MADD s4, a2, ALPHAR, s2
+    ADD s3, s3, a3
+    ADD s4, s4, a4
+#else
+    MUL s1, ALPHAI, a2
+    MUL s2, ALPHAI, a1
+    MADD s3, ALPHAR, a1, s1
+    MSUB s4, a2, ALPHAR, s2
+    ADD s3, s3, a3
+    SUB s4, a4, s4
+#endif
+    ST s3, Y, 0 * SIZE
+    ST s4, Y, 1 * SIZE
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/caxpy_lsx.S b/kernel/loongarch64/caxpy_lsx.S
new file mode 100644
index 000000000..85598d0b9
--- /dev/null
+++ b/kernel/loongarch64/caxpy_lsx.S
@@ -0,0 +1,679 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define XX $r5
+#define YY $r6
+#define ALPHAR $f0
+#define ALPHAI $f1
+#define X $r7
+#define INCX $r8
+#define Y $r9
+#define INCY $r10
+
+#define I $r12
+#define TEMP $r13
+#define t1 $r14
+#define t2 $r16
+#define t3 $r15
+#define t4 $r17
+#define a1 $f12
+#define a2 $f13
+#define a3 $f14
+#define a4 $f15
+#define s1 $f16
+#define s2 $f17
+#define s3 $f18
+#define s4 $f19
+#define VX0 $vr8
+#define VX1 $vr20
+#define VX2 $vr21
+#define VX3 $vr22
+#define VXAR $vr23
+#define VXAI $vr19
+#define x1 $vr18
+#define x2 $vr17
+#define x3 $vr16
+#define x4 $vr15
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    FFINT a1, a1
+    CMPEQ $fcc0, ALPHAR, a1
+    CMPEQ $fcc1, ALPHAI, a1
+    bceqz $fcc0, .L10
+    bcnez $fcc1, .L999
+.L10:
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    slli.d INCY, INCY, ZBASE_SHIFT
+    MTG t1, ALPHAR
+    MTG t2, ALPHAI
+#ifdef DOUBLE
+    vreplgr2vr.d VXAR, t1
+    vreplgr2vr.d VXAI, t2
+#else
+    vreplgr2vr.w VXAR, t1
+    vreplgr2vr.w VXAI, t2
+#endif
+    srai.d I, N, 2
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11 // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L997
+    .align 3
+
+.L111:
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vpickev.w x3, VX3, VX2
+    vpickod.w x4, VX3, VX2
+#endif
+#if !defined(CONJ)
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmsub.s VX1, VXAR, x1, VX0
+    vfmadd.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfadd.s x4, x4, VX3
+#endif
+#else
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmadd.s VX1, VXAR, x1, VX0
+    vfmsub.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfsub.s x4, x4, VX3
+#endif
+#endif
+#ifdef DOUBLE
+    vilvl.d VX2, x4, x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+#if !defined(CONJ)
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#endif
+    vilvl.d VX2, x4, x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+#else
+    vilvl.w VX2, x4, x3
+    vilvh.w VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997
+    move YY, Y
+    .align 3
+
+.L121:
+#ifdef
DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 +#else + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * 
SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + 
vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 +#endif + add.d Y, Y, INCY +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 +#else + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 +#if !defined(CONJ) + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s3, ALPHAR, a1, s1 + MADD s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + ADD s4, s4, a4 +#else + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MADD s3, ALPHAR, a1, s1 + MSUB s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + SUB s4, a4, s4 +#endif + ST s3, Y, 0 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/ccopy_lasx.S b/kernel/loongarch64/ccopy_lasx.S new file mode 100644 index 000000000..fbc5d96bc --- /dev/null +++ b/kernel/loongarch64/ccopy_lasx.S @@ -0,0 +1,386 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define Y $r7
+#define INCY $r8
+#define I $r17
+#define TEMP $r18
+#define t1 $r14
+#define t2 $r15
+#define t3 $r16
+#define t4 $r19
+#define a1 $f12
+#define a2 $f13
+#define a3 $f14
+#define a4 $f15
+#define VX0 $xr12
+#define VX1 $xr13
+#define VX2 $xr14
+#define VX3 $xr15
+
+    PROLOGUE
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    slli.d INCY, INCY, ZBASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11 // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvld VX2, X, 8 * SIZE
+    xvld VX3, X, 12 * SIZE
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 4 * SIZE
+    xvst VX2, Y, 8 * SIZE
+    xvst VX3, Y, 12 * SIZE
+#else
+    xvld VX1, X, 8 * SIZE
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 8 * SIZE
+#endif
+    addi.d I, I, -1
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, 2 * SIZE
+    ST a1, Y, 0 * SIZE
+    ST a2, Y, 1 * SIZE
+    addi.d Y, Y, 2 * SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+.L12:
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvld VX2, X, 8 * SIZE
+    xvld VX3, X, 12 * SIZE
+    xvstelm.d VX0, Y, 0 * SIZE, 0
+    xvstelm.d VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0 * SIZE, 2
+    xvstelm.d VX0, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0 * SIZE, 0
+    xvstelm.d VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0 * SIZE, 2
+    xvstelm.d VX1, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.d VX2, Y, 0 * SIZE, 0
+    xvstelm.d VX2, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX2, Y, 0 * SIZE, 2
+    xvstelm.d VX2, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.d VX3, Y, 0 * SIZE, 0
+    xvstelm.d VX3, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d
VX3, Y, 0 * SIZE, 2 + xvstelm.d VX3, Y, 1 * SIZE, 3 +#else + xvld VX1, X, 8 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 0 + xvstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 2 + xvstelm.w VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 4 + xvstelm.w VX0, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 6 + xvstelm.w VX0, Y, 1 * SIZE, 7 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 0 + xvstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 2 + xvstelm.w VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 4 + xvstelm.w VX1, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 6 + xvstelm.w VX1, Y, 1 * SIZE, 7 +#endif + add.d Y, Y, INCY + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + xvst VX2, Y, 8 * SIZE + xvst VX3, Y, 12 * SIZE +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 8 * SIZE +#endif + addi.d I, I, -1 + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: 
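+    // Neither stride is unit here, so the elements are not contiguous in
+    // memory; fall back to scalar LD/ST pairs, moving eight complex
+    // values (four two-element groups) per loop iteration.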
+ LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/ccopy_lsx.S b/kernel/loongarch64/ccopy_lsx.S new file mode 100644 index 000000000..4c4d880f1 --- /dev/null +++ b/kernel/loongarch64/ccopy_lsx.S @@ -0,0 +1,411 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11:// INCX==1 and INCY==1 + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + addi.d I, I, -1 + vst VX0, Y, 8 * SIZE + vst VX1, Y, 10 * SIZE + vst VX2, Y, 12 * SIZE + vst VX3, Y, 14 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + addi.d I, I, -1 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + vst VX2, Y, 8 * SIZE + vst VX3, Y, 12 * SIZE +#endif + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + vstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0 * SIZE, 0 + vstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX2, Y, 0 * SIZE, 0 + vstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX3, Y, 0 * SIZE, 0 + vstelm.d VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + vstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0 * SIZE, 0 + vstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX2, Y, 0 * SIZE, 0 + vstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX3, Y, 0 * SIZE, 0 + vstelm.d VX3, Y, 1 * SIZE, 1 +#else + vld VX1, X, 4 * SIZE + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 0 + vstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0 * SIZE, 2 + vstelm.w VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0 * SIZE, 0 + vstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0 * SIZE, 2 + vstelm.w VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX2, Y, 0 * SIZE, 0 + vstelm.w VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX2, Y, 0 * SIZE, 2 + vstelm.w VX2, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX3, Y, 0 * SIZE, 0 + vstelm.w VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX3, Y, 0 * SIZE, 2 + vstelm.w VX3, Y, 1 * SIZE, 3 +#endif + add.d Y, Y, INCY + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + 
bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 8 * SIZE + vst VX1, Y, 10 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 12 * SIZE + vst VX1, Y, 14 * SIZE +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + vst VX2, Y, 8 * SIZE + vst VX3, Y, 12 * SIZE +#endif + addi.d Y, Y, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * 
SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cdot_lasx.S b/kernel/loongarch64/cdot_lasx.S new file mode 100644 index 000000000..0583e56ea --- /dev/null +++ b/kernel/loongarch64/cdot_lasx.S @@ -0,0 +1,565 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r19 +#define TEMP $r10 +#define t1 $r11 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define res1 $xr16 +#define res2 $xr17 +#define res3 $xr18 +#define res4 $xr19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define x1 $xr20 +#define x2 $xr21 +#define x3 $xr22 +#define x4 $xr23 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res3, res3, res3 + xvxor.v res4, res4, res4 + bge $r0, N, .L999 + li.d TEMP, 2 * SIZE + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 2 +#else + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L996 + .align 3 + +.L12: + bge $r0, I, .L997 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + addi.d X, X, 8 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + xvld VX1, X, 8 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + 
add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + addi.d X, X, 16 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L21: + bge $r0, I, .L997 + .align 3 + +.L211: + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + addi.d Y, Y, 8 * SIZE + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvld VX3, Y, 8 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + addi.d Y, Y, 8 * SIZE + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + b .L996 + .align 3 + +.L22: + bge $r0, I, .L997 + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 
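+    // res1..res4 accumulate the partial products xr*yr, xi*yr, xr*yi and
+    // xi*yi; .L996 reduces each accumulator across lanes and .L999 combines
+    // the four sums into the real/imaginary result according to CONJ.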
+#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L996: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + xvpickve.d VX1, res2, 1 + xvpickve.d VX2, res2, 2 + xvpickve.d VX3, res2, 3 + xvfadd.d res2, VX1, res2 + xvfadd.d res2, VX2, res2 + xvfadd.d res2, VX3, res2 + xvpickve.d VX1, res3, 1 + xvpickve.d VX2, res3, 2 + xvpickve.d VX3, res3, 3 + xvfadd.d res3, VX1, res3 + xvfadd.d res3, VX2, res3 + xvfadd.d res3, VX3, res3 + xvpickve.d VX1, res4, 1 + xvpickve.d VX2, res4, 2 + xvpickve.d VX3, res4, 3 + xvfadd.d res4, VX1, res4 + xvfadd.d res4, VX2, res4 + xvfadd.d res4, VX3, res4 +#else + xvpickve.w VX0, res1, 1 + xvpickve.w VX1, res1, 2 + xvpickve.w VX2, res1, 3 + xvpickve.w VX3, res1, 4 + xvpickve.w x1, res1, 5 + xvpickve.w x2, res1, 6 + xvpickve.w x3, res1, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvfadd.s res1, x1, res1 + xvfadd.s res1, x2, res1 + xvfadd.s res1, x3, res1 + xvpickve.w VX0, res2, 1 + xvpickve.w VX1, res2, 2 + xvpickve.w VX2, res2, 3 + xvpickve.w VX3, res2, 4 + xvpickve.w x1, res2, 5 + xvpickve.w x2, res2, 6 + xvpickve.w x3, res2, 7 + xvfadd.s res2, VX0, res2 + xvfadd.s res2, VX1, res2 + xvfadd.s res2, VX2, res2 + xvfadd.s res2, VX3, res2 + xvfadd.s res2, x1, res2 + xvfadd.s res2, x2, res2 + xvfadd.s res2, x3, res2 + xvpickve.w VX0, res3, 1 + xvpickve.w VX1, res3, 2 + xvpickve.w VX2, res3, 3 + xvpickve.w VX3, res3, 4 + xvpickve.w x1, res3, 5 
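+    // finish folding the remaining lanes of res3 and res4 into lane 0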
+    xvpickve.w x2, res3, 6
+    xvpickve.w x3, res3, 7
+    xvfadd.s res3, VX0, res3
+    xvfadd.s res3, VX1, res3
+    xvfadd.s res3, VX2, res3
+    xvfadd.s res3, VX3, res3
+    xvfadd.s res3, x1, res3
+    xvfadd.s res3, x2, res3
+    xvfadd.s res3, x3, res3
+    xvpickve.w VX0, res4, 1
+    xvpickve.w VX1, res4, 2
+    xvpickve.w VX2, res4, 3
+    xvpickve.w VX3, res4, 4
+    xvpickve.w x1, res4, 5
+    xvpickve.w x2, res4, 6
+    xvpickve.w x3, res4, 7
+    xvfadd.s res4, VX0, res4
+    xvfadd.s res4, VX1, res4
+    xvfadd.s res4, VX2, res4
+    xvfadd.s res4, VX3, res4
+    xvfadd.s res4, x1, res4
+    xvfadd.s res4, x2, res4
+    xvfadd.s res4, x3, res4
+#endif
+    .align 3
+
+.L997:
+#ifdef DOUBLE
+    andi I, N, 3
+#else
+    andi I, N, 7
+#endif
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE
+    LD a4, Y, 1 * SIZE
+    MADD s1, a1, a3, s1
+    MADD s2, a2, a3, s2
+    MADD s3, a1, a4, s3
+    MADD s4, a2, a4, s4
+    addi.d I, I, -1
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+#ifndef CONJ
+    SUB $f0, s1, s4
+    ADD $f1, s3, s2
+#else
+    ADD $f0, s1, s4
+    SUB $f1, s3, s2
+#endif
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/cdot_lsx.S b/kernel/loongarch64/cdot_lsx.S
new file mode 100644
index 000000000..5feea12be
--- /dev/null
+++ b/kernel/loongarch64/cdot_lsx.S
@@ -0,0 +1,397 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r19 +#define TEMP $r10 +#define t1 $r11 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 +#define res4 $vr19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define x1 $vr20 +#define x2 $vr21 +#define x3 $vr22 +#define x4 $vr23 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res3, res3, res3 + vxor.v res4, res4, res4 + bge $r0, N, .L999 + li.d TEMP, 2 * SIZE + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 1 +#else + srai.d I, N, 2 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L996 + .align 3 + +.L12: + bge $r0, I, .L997 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + addi.d X, X, 4 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + addi.d X, X, 8 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L21: + bge $r0, I, .L997 + .align 3 + +.L211: + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + 
vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + addi.d Y, Y, 4 * SIZE + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + addi.d Y, Y, 8 * SIZE + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + b .L996 + .align 3 + +.L22: + bge $r0, I, .L997 + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L996: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + vreplvei.d VX1, res2, 1 + vfadd.d res2, VX1, res2 + vreplvei.d VX1, res3, 1 + vfadd.d res3, VX1, res3 + vreplvei.d VX1, res4, 1 + vfadd.d res4, VX1, res4 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 + vreplvei.w VX1, res2, 1 + vreplvei.w VX2, res2, 2 + vreplvei.w VX3, res2, 3 + vfadd.s res2, VX1, res2 + vfadd.s res2, VX2, res2 + vfadd.s res2, VX3, res2 + vreplvei.w VX1, res3, 1 + vreplvei.w VX2, res3, 2 + vreplvei.w VX3, res3, 3 + vfadd.s res3, VX1, res3 + vfadd.s res3, VX2, res3 + vfadd.s res3, VX3, res3 + vreplvei.w VX1, res4, 1 + vreplvei.w VX2, res4, 2 + vreplvei.w VX3, res4, 3 + vfadd.s res4, VX1, res4 + vfadd.s res4, VX2, res4 + 
vfadd.s res4, VX3, res4 +#endif + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 1 +#else + andi I, N, 3 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MADD s1, a1, a3, s1 + MADD s2, a2, a3, s2 + MADD s3, a1, a4, s3 + MADD s4, a2, a4, s4 + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 + ADD $f1, s3, s2 +#else + ADD $f0, s1, s4 + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cgemm_kernel_2x2_lasx.S b/kernel/loongarch64/cgemm_kernel_2x2_lasx.S new file mode 100644 index 000000000..e07f7dc64 --- /dev/null +++ b/kernel/loongarch64/cgemm_kernel_2x2_lasx.S @@ -0,0 +1,857 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LASX vectors */ +#define U0 $xr30 +#define U1 $xr31 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define VALPHAR $xr28 +#define VALPHAI $xr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVNMSUB +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define XVFADD1 XVFADD +#define XVFADD2 XVFADD +#define XVFADD3 XVFSUB +#define XVFADD4 XVFADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVFMADD +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define XVFADD1 XVFADD +#define XVFADD2 XVFADD +#define XVFADD3 XVFADD +#define XVFADD4 XVFSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVFMADD +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define XVFADD1 XVFADD +#define XVFADD2 XVFSUB +#define XVFADD3 XVFADD +#define XVFADD4 XVFADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVNMSUB +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define XVFADD1 XVFADD +#define XVFADD2 XVFSUB +#define XVFADD3 XVFSUB +#define XVFADD4 XVFSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, 
-128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + xvldrepl.w VALPHAR, $sp, 112 + xvldrepl.w VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, 2 + + move J, $r0 + srai.d T0, N, 1 + beq J, T0, .L19 + +.L10: /* for(j=0; j 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x00 + xvld U2, S2, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U2, U1, 0x31 + + xvpermi.d U0, U0, 0xd8 + xvpermi.d U2, U2, 0xd8 + + xvst U0, TD, 0x00 + xvst U2, TD, 0x20 + + addi.d S1, S1, 0x20 // a_offset1 + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + xvld U0, TS, 0x00 + + xvst U0, TD, 0x00 + + addi.d TS, TS, 0x20 // a_offset + addi.d TD, TD, 0x20 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d TS, TS, 0x08 // a_offset + addi.d TD, TD, 0x08 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_ncopy_2_lsx.S b/kernel/loongarch64/cgemm_ncopy_2_lsx.S new file mode 100644 index 000000000..1cf4d87dc --- /dev/null +++ b/kernel/loongarch64/cgemm_ncopy_2_lsx.S @@ -0,0 +1,202 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 +#define D8 $vr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + slli.d T0, TL, 0x01 + + srai.d I, N, 0x01 + beq I, ZERO, .L_N0 + +.L_J1: /* if (i > 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vand.v D0, U2, U2 + vand.v D1, U3, U3 + vand.v D2, U2, U2 + vand.v D3, U3, U3 + + vpermi.w D0, U0, 0x44 + vpermi.w D2, U0, 0xee + vpermi.w D1, U1, 0x44 + vpermi.w D3, U1, 0xee + + vst D0, TD, 0x00 + vst D2, TD, 0x10 + vst D1, TD, 0x20 + vst D3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset1 + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + vld U0, TS, 0x00 + vld U1, TS, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TS, TS, 0x20 // a_offset + addi.d TD, TD, 0x20 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d TS, TS, 0x08 // a_offset + addi.d TD, TD, 0x08 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE 
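For reference, the packing order produced by this ncopy_2 kernel (and its LASX twin above) corresponds to the scalar loop sketched below in plain C; the function name and the interleaved re/im single-precision layout are illustrative assumptions, not part of the patch:

    #include <stddef.h>

    /* Two source lines, lda complex elements apart, are interleaved
       element by element into the packed buffer; an odd trailing line
       is copied through unchanged. */
    static void cgemm_ncopy_2_ref(size_t m, size_t n, const float *a,
                                  size_t lda, float *b)
    {
        size_t i, j;
        for (j = 0; j + 1 < n; j += 2) {
            const float *s1 = a + 2 * j * lda;        /* line j   */
            const float *s2 = a + 2 * (j + 1) * lda;  /* line j+1 */
            for (i = 0; i < m; i++) {
                *b++ = s1[2 * i];      /* re, line j   */
                *b++ = s1[2 * i + 1];  /* im, line j   */
                *b++ = s2[2 * i];      /* re, line j+1 */
                *b++ = s2[2 * i + 1];  /* im, line j+1 */
            }
        }
        if (n & 1) {  /* leftover line: straight copy */
            const float *s1 = a + 2 * (n - 1) * lda;
            for (i = 0; i < 2 * m; i++)
                *b++ = s1[i];
        }
    }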
\ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_2_lasx.S b/kernel/loongarch64/cgemm_tcopy_2_lasx.S new file mode 100644 index 000000000..e2245e412 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_2_lasx.S @@ -0,0 +1,218 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x20 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x00 + xvld U2, S2, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U2, U1, 0x31 + + xvst U0, S8, 0x00 + + slli.d T0, M, 0x04 + add.d S8, S8, T0 + + xvst U2, S8, 0x00 + + add.d S8, S8, T0 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S8, 0x00 + vst $vr1, S8, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + fst.s F2, S9, 0x08 + fst.s F3, S9, 0x0c + + addi.d S9, S9, 0x10 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld $vr0, TS, 0x00 + vld $vr1, TS, 0x10 + + vst $vr0, TD, 0x00 + + slli.d T0, M, 0x04 + add.d TD, TD, T0 + + vst $vr1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld $vr0, TS, 0x00 + + vst $vr0, TD, 0x00 + + addi.d TS, TS, 0x10 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_2_lsx.S b/kernel/loongarch64/cgemm_tcopy_2_lsx.S new file mode 100644 index 000000000..15c0fde8f --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_2_lsx.S @@ -0,0 +1,218 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x20 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S8, 0x00 + vst U2, S8, 0x10 + + slli.d T0, M, 0x04 + add.d S8, S8, T0 + + vst U1, S8, 0x00 + vst U3, S8, 0x10 + + add.d S8, S8, T0 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + 
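+	// odd n: the trailing complex element of each of the two source
+	// lines is routed to the boffset2 area (S9), not the main block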
beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + fst.s F2, S9, 0x08 + fst.s F3, S9, 0x0c + + addi.d S9, S9, 0x10 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, TS, 0x00 + vld U1, TS, 0x10 + + vst U0, TD, 0x00 + + slli.d T0, M, 0x04 + add.d TD, TD, T0 + + vst U1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, TS, 0x00 + + vst U0, TD, 0x00 + + addi.d TS, TS, 0x10 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cnrm2_lasx.S b/kernel/loongarch64/cnrm2_lasx.S new file mode 100644 index 000000000..3a60069ac --- /dev/null +++ b/kernel/loongarch64/cnrm2_lasx.S @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r17
+#define TEMP $r18
+#define t1 $r12
+#define t2 $r13
+#define t3 $r14
+#define t4 $r15
+#define a1 $f15
+#define a2 $f16
+#define res $f19
+#define VX0 $xr15
+#define VX1 $xr16
+#define VX2 $xr17
+#define VX3 $xr18
+#define VX4 $xr21
+#define res1 $xr19
+#define res2 $xr20
+
+	PROLOGUE
+
+#ifdef F_INTERFACE
+	LDINT N, 0(N)
+	LDINT INCX, 0(INCX)
+#endif
+
+	xvxor.v res1, res1, res1
+	xvxor.v res2, res2, res2
+	bge $r0, N, .L999
+	beq $r0, INCX, .L999
+	li.d TEMP, 2 * SIZE
+	slli.d INCX, INCX, ZBASE_SHIFT
+	srai.d I, N, 2
+	bne INCX, TEMP, .L20
+	bge $r0, I, .L997
+	.align 3
+
+.L10:
+	xvld VX0, X, 0 * SIZE
+	xvfcvtl.d.s VX1, VX0
+	xvfcvth.d.s VX2, VX0
+	xvfmadd.d res1, VX1, VX1, res1
+	xvfmadd.d res2, VX2, VX2, res2
+	addi.d I, I, -1
+	addi.d X, X, 8 * SIZE
+	blt $r0, I, .L10
+	.align 3
+	b .L996
+
+.L20:
+	bge $r0, I, .L997
+	.align 3
+
+.L21:
+	ld.w t1, X, 0 * SIZE
+	ld.w t2, X, 1 * SIZE
+	add.d X, X, INCX
+	ld.w t3, X, 0 * SIZE
+	ld.w t4, X, 1 * SIZE
+	add.d X, X, INCX
+	xvinsgr2vr.w VX0, t1, 0
+	xvinsgr2vr.w VX0, t2, 1
+	xvinsgr2vr.w VX0, t3, 2
+	xvinsgr2vr.w VX0, t4, 3
+	ld.w t1, X, 0 * SIZE
+	ld.w t2, X, 1 * SIZE
+	add.d X, X, INCX
+	ld.w t3, X, 0 * SIZE
+	ld.w t4, X, 1 * SIZE
+	xvinsgr2vr.w VX0, t1, 4
+	xvinsgr2vr.w VX0, t2, 5
+	xvinsgr2vr.w VX0, t3, 6
+	xvinsgr2vr.w VX0, t4, 7
+	add.d X, X, INCX
+	xvfcvtl.d.s VX1, VX0
+	xvfcvth.d.s VX2, VX0
+	xvfmadd.d res1, VX1, VX1, res1
+	xvfmadd.d res2, VX2, VX2, res2
+	addi.d I, I, -1
+	blt $r0, I, .L21
+	b .L996
+
+.L996:
+	xvfadd.d res1, res1, res2
+	xvpickve.d VX1, res1, 1
+	xvpickve.d VX2, res1, 2
+	xvpickve.d VX3, res1, 3
+	xvfadd.d res1, VX1, res1
+	xvfadd.d res1, VX2, res1
+	xvfadd.d res1, VX3, res1
+	.align 3
+
+.L997:
+	andi I, N, 3
+	bge $r0, I, .L999
+	.align 3
+
+.L998:
+	fld.s a1, X, 0 * SIZE
+	fld.s a2, X, 1 * SIZE
+	addi.d I, I, -1
+	fcvt.d.s a1, a1
+	fcvt.d.s a2, a2
+	fmadd.d res, a1, a1, res
+	fmadd.d res, a2, a2, res
+	add.d X, X, INCX
+	blt $r0, I, .L998
+	.align 3
+
+.L999:
+	fsqrt.d res, res
+	move $r4, $r17
+	fcvt.s.d $f0, res
+	jirl $r0, $r1, 0x0
+
+	EPILOGUE
diff --git a/kernel/loongarch64/cnrm2_lsx.S b/kernel/loongarch64/cnrm2_lsx.S
new file mode 100644
index 000000000..20950ba17
--- /dev/null
+++ b/kernel/loongarch64/cnrm2_lsx.S
@@ -0,0 +1,155 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define a1 $f15 +#define a2 $f16 +#define res $f19 +#define VX0 $vr15 +#define VX1 $vr16 +#define VX2 $vr17 +#define VX3 $vr18 +#define VX4 $vr21 +#define res1 $vr19 +#define res2 $vr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + vld VX0, X, 4 * SIZE + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L996 + .align 3 + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s a1, X, 0 * SIZE + fld.s a2, X, 1 * SIZE + addi.d I, I, -1 + fcvt.d.s a1, a1 + fcvt.d.s a2, a2 + fmadd.d res, a1, a1, res + fmadd.d res, a2, a2, res + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d res, res + move $r4, $r17 + fcvt.s.d $f0, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/copy_lasx.S b/kernel/loongarch64/copy_lasx.S new file mode 100644 index 000000000..31f91cec1 --- /dev/null +++ b/kernel/loongarch64/copy_lasx.S @@ -0,0 +1,306 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 + addi.d I, I, -1 + xvst VX0, Y, 0 +#ifdef DOUBLE + xvld VX0, X, 32 + xvst VX0, Y, 32 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 3 + add.d Y, Y, INCY +#else + xvld VX0, X, 0 + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 7 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* 
INCX!=1 and INCY==1 */
+.L21:
+	bge $r0, I, .L212
+	.align 3
+
+.L211:
+#ifdef DOUBLE
+	ld.d t1, X, 0
+	add.d X, X, INCX
+	ld.d t2, X, 0
+	add.d X, X, INCX
+	ld.d t3, X, 0
+	add.d X, X, INCX
+	ld.d t4, X, 0
+	add.d X, X, INCX
+	xvinsgr2vr.d VX0, t1, 0
+	xvinsgr2vr.d VX0, t2, 1
+	xvinsgr2vr.d VX0, t3, 2
+	xvinsgr2vr.d VX0, t4, 3
+	xvst VX0, Y, 0
+	ld.d t1, X, 0
+	add.d X, X, INCX
+	ld.d t2, X, 0
+	add.d X, X, INCX
+	ld.d t3, X, 0
+	add.d X, X, INCX
+	ld.d t4, X, 0
+	add.d X, X, INCX
+	xvinsgr2vr.d VX1, t1, 0
+	xvinsgr2vr.d VX1, t2, 1
+	xvinsgr2vr.d VX1, t3, 2
+	xvinsgr2vr.d VX1, t4, 3
+	xvst VX1, Y, 32
+#else
+	ld.w t1, X, 0
+	add.d X, X, INCX
+	ld.w t2, X, 0
+	add.d X, X, INCX
+	ld.w t3, X, 0
+	add.d X, X, INCX
+	ld.w t4, X, 0
+	add.d X, X, INCX
+	xvinsgr2vr.w VX0, t1, 0
+	xvinsgr2vr.w VX0, t2, 1
+	xvinsgr2vr.w VX0, t3, 2
+	xvinsgr2vr.w VX0, t4, 3
+	ld.w t1, X, 0
+	add.d X, X, INCX
+	ld.w t2, X, 0
+	add.d X, X, INCX
+	ld.w t3, X, 0
+	add.d X, X, INCX
+	ld.w t4, X, 0
+	add.d X, X, INCX
+	xvinsgr2vr.w VX0, t1, 4
+	xvinsgr2vr.w VX0, t2, 5
+	xvinsgr2vr.w VX0, t3, 6
+	xvinsgr2vr.w VX0, t4, 7
+	xvst VX0, Y, 0
+#endif
+	addi.d Y, Y, 8 * SIZE
+	addi.d I, I, -1
+	blt $r0, I, .L211
+	.align 3
+
+.L212:
+	andi I, N, 7
+	bge $r0, I, .L999
+	.align 3
+
+.L213:
+	LD $f12, X, 0
+	addi.d I, I, -1
+	ST $f12, Y, 0
+	add.d X, X, INCX
+	addi.d Y, Y, SIZE
+	blt $r0, I, .L213
+	b .L999
+	.align 3
+
+/* INCX!=1 and INCY!=1 */
+.L22:
+	bge $r0, I, .L223
+	.align 3
+
+.L222:
+	LD a1, X, 0
+	add.d X, X, INCX
+	LD a2, X, 0
+	add.d X, X, INCX
+	LD a3, X, 0
+	add.d X, X, INCX
+	LD a4, X, 0
+	add.d X, X, INCX
+	ST a1, Y, 0
+	add.d Y, Y, INCY
+	ST a2, Y, 0
+	add.d Y, Y, INCY
+	ST a3, Y, 0
+	add.d Y, Y, INCY
+	ST a4, Y, 0
+	add.d Y, Y, INCY
+	LD a1, X, 0
+	add.d X, X, INCX
+	LD a2, X, 0
+	add.d X, X, INCX
+	LD a3, X, 0
+	add.d X, X, INCX
+	LD a4, X, 0
+	add.d X, X, INCX
+	ST a1, Y, 0
+	add.d Y, Y, INCY
+	ST a2, Y, 0
+	add.d Y, Y, INCY
+	ST a3, Y, 0
+	add.d Y, Y, INCY
+	ST a4, Y, 0
+	add.d Y, Y, INCY
+	addi.d I, I, -1
+	blt $r0, I, .L222
+	.align 3
+
+.L223:
+	andi I, N, 7
+	bge $r0, I, .L999
+	.align 3
+
+.L224:
+	LD $f12, X, 0
+	addi.d I, I, -1
+	ST $f12, Y, 0
+	add.d X, X, INCX
+	add.d Y, Y, INCY
+	blt $r0, I, .L224
+	.align 3
+
+.L999:
+	move $r4, $r0
+	jirl $r0, $r1, 0x0
+	.align 3
+
+	EPILOGUE
diff --git a/kernel/loongarch64/copy_lsx.S b/kernel/loongarch64/copy_lsx.S
new file mode 100644
index 000000000..bb10f3565
--- /dev/null
+++ b/kernel/loongarch64/copy_lsx.S
@@ -0,0 +1,316 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + vst VX0, Y, 0 + vst VX1, Y, 16 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + vst VX0, Y, 32 + vst VX1, Y, 48 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + vld VX0, X, 32 + vld VX1, X, 48 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY +#else + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 3 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 
1
+	vst VX0, Y, 0
+	vst VX1, Y, 16
+	ld.d t1, X, 0
+	add.d X, X, INCX
+	ld.d t2, X, 0
+	add.d X, X, INCX
+	ld.d t3, X, 0
+	add.d X, X, INCX
+	ld.d t4, X, 0
+	add.d X, X, INCX
+	vinsgr2vr.d VX0, t1, 0
+	vinsgr2vr.d VX0, t2, 1
+	vinsgr2vr.d VX1, t3, 0
+	vinsgr2vr.d VX1, t4, 1
+	vst VX0, Y, 32
+	vst VX1, Y, 48
+#else
+	ld.w t1, X, 0
+	add.d X, X, INCX
+	ld.w t2, X, 0
+	add.d X, X, INCX
+	ld.w t3, X, 0
+	add.d X, X, INCX
+	ld.w t4, X, 0
+	add.d X, X, INCX
+	vinsgr2vr.w VX0, t1, 0
+	vinsgr2vr.w VX0, t2, 1
+	vinsgr2vr.w VX0, t3, 2
+	vinsgr2vr.w VX0, t4, 3
+	vst VX0, Y, 0
+	ld.w t1, X, 0
+	add.d X, X, INCX
+	ld.w t2, X, 0
+	add.d X, X, INCX
+	ld.w t3, X, 0
+	add.d X, X, INCX
+	ld.w t4, X, 0
+	add.d X, X, INCX
+	vinsgr2vr.w VX1, t1, 0
+	vinsgr2vr.w VX1, t2, 1
+	vinsgr2vr.w VX1, t3, 2
+	vinsgr2vr.w VX1, t4, 3
+	vst VX1, Y, 16
+#endif
+	addi.d Y, Y, 8 * SIZE
+	addi.d I, I, -1
+	blt $r0, I, .L211
+	.align 3
+
+.L212:
+	andi I, N, 7
+	bge $r0, I, .L999
+	.align 3
+
+.L213:
+	LD $f12, X, 0
+	addi.d I, I, -1
+	ST $f12, Y, 0
+	add.d X, X, INCX
+	addi.d Y, Y, SIZE
+	blt $r0, I, .L213
+	b .L999
+	.align 3
+
+/* INCX!=1 and INCY!=1 */
+.L22:
+	bge $r0, I, .L223
+	.align 3
+
+.L222:
+	LD a1, X, 0
+	add.d X, X, INCX
+	LD a2, X, 0
+	add.d X, X, INCX
+	LD a3, X, 0
+	add.d X, X, INCX
+	LD a4, X, 0
+	add.d X, X, INCX
+	ST a1, Y, 0
+	add.d Y, Y, INCY
+	ST a2, Y, 0
+	add.d Y, Y, INCY
+	ST a3, Y, 0
+	add.d Y, Y, INCY
+	ST a4, Y, 0
+	add.d Y, Y, INCY
+	LD a1, X, 0
+	add.d X, X, INCX
+	LD a2, X, 0
+	add.d X, X, INCX
+	LD a3, X, 0
+	add.d X, X, INCX
+	LD a4, X, 0
+	add.d X, X, INCX
+	ST a1, Y, 0
+	add.d Y, Y, INCY
+	ST a2, Y, 0
+	add.d Y, Y, INCY
+	ST a3, Y, 0
+	add.d Y, Y, INCY
+	ST a4, Y, 0
+	add.d Y, Y, INCY
+	addi.d I, I, -1
+	blt $r0, I, .L222
+	.align 3
+
+.L223:
+	andi I, N, 7
+	bge $r0, I, .L999
+	.align 3
+
+.L224:
+	LD $f12, X, 0
+	addi.d I, I, -1
+	ST $f12, Y, 0
+	add.d X, X, INCX
+	add.d Y, Y, INCY
+	blt $r0, I, .L224
+	.align 3
+
+.L999:
+	move $r4, $r0
+	jirl $r0, $r1, 0x0
+	.align 3
+
+	EPILOGUE
diff --git a/kernel/loongarch64/crot_lasx.S b/kernel/loongarch64/crot_lasx.S
new file mode 100644
index 000000000..d4ec1e22c
--- /dev/null
+++ b/kernel/loongarch64/crot_lasx.S
@@ -0,0 +1,1079 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VT0 $xr10 +#define VT1 $xr18 +#define VXC $xr23 +#define VXS $xr9 +#define VXZ $xr11 +#define x1 $xr12 +#define x2 $xr13 +#define x3 $xr14 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, C + MTG t2, S + MTG t3, a1 +#ifdef DOUBLE + xvreplgr2vr.d VXC, t1 + xvreplgr2vr.d VXS, t2 + xvreplgr2vr.d VXZ, t3 + srai.d I, N, 2 +#else + xvreplgr2vr.w VXC, t1 + xvreplgr2vr.w VXS, t2 + xvreplgr2vr.w VXZ, t3 + srai.d I, N, 3 +#endif + beq INCX, $r0, .L996 + beq INCY, $r0, .L996 + bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1 + bne INCY, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x1, VXC + xvfmadd.d VX0, x3, VXS, VX0 + xvfmul.d VX1, x1, VXS + xvfmsub.d VX1, x3, VXC, VX1 + xvfmul.d VX2, x2, VXC + xvfmadd.d VX2, x4, VXS, VX2 + xvfmul.d VX3, x2, VXS + xvfmsub.d VX3, x4, VXC, VX3 + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x1, VXC + xvfmadd.s VX0, x3, VXS, VX0 + xvfmul.s VX1, x1, VXS + xvfmsub.s VX1, x3, VXC, VX1 + xvfmul.s VX2, x2, VXC + xvfmadd.s VX2, x4, VXS, VX2 + xvfmul.s VX3, x2, VXS + xvfmsub.s VX3, x4, VXC, VX3 + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE 
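+	// S == 0 in this branch, so the rotation degenerates to scaling
+	// both vectors by C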
+ xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x1, VXC + xvfmul.d VX1, x3, VXC + xvfmul.d VX2, x2, VXC + xvfmul.d VX3, x4, VXC + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x1, VXC + xvfmul.s VX1, x3, VXC + xvfmul.s VX2, x2, VXC + xvfmul.s VX3, x4, VXC + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x3, VXS + xvfmul.d VX1, x1, VXS + xvfsub.d VX1, VXZ, VX1 + xvfmul.d VX2, x4, VXS + xvfmul.d VX3, x2, VXS + xvfsub.d VX3, VXZ, VX3 + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x3, VXS + xvfmul.s VX1, x1, VXS + xvfsub.s VX1, VXZ, VX1 + xvfmul.s VX2, x4, VXS + xvfmul.s VX3, x2, VXS + xvfsub.s VX3, VXZ, VX3 + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + xvst VXZ, X, 0 * SIZE + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 4 * SIZE + xvst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvst VXZ, X, 8 * SIZE + xvst VXZ, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + 
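+	// x1/x2 collect the real/imaginary parts of four strided X elements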
xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + + xvfmul.d VX0, x1, VXC + xvfmadd.d VX0, x3, VXS, VX0 + xvfmul.d VX1, x1, VXS + xvfmsub.d VX1, x3, VXC, VX1 + xvfmul.d VX2, x2, VXC + xvfmadd.d VX2, x4, VXS, VX2 + xvfmul.d VX3, x2, VXS + xvfmsub.d VX3, x4, VXC, VX3 + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + + xvfmul.s VX0, x1, VXC + xvfmadd.s VX0, x3, VXS, VX0 + xvfmul.s VX1, x1, VXS + xvfmsub.s VX1, x3, VXC, VX1 + xvfmul.s VX2, x2, VXC + xvfmadd.s VX2, x4, VXS, VX2 + xvfmul.s VX3, x2, VXS + xvfmsub.s VX3, x4, VXC, VX3 + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + 
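+	// VX1/VX3 hold the rotated Y values; each element is scattered back
+	// as an adjacent re/im pair through INCY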
xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, x1, VXC + xvfmul.d VX1, x3, VXC + xvfmul.d VX2, x2, VXC + xvfmul.d VX3, x4, VXC + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 
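+	// x3/x4 gather the real/imaginary parts of the strided Y elements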
+ xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, x1, VXC + xvfmul.s VX1, x3, VXC + xvfmul.s VX2, x2, VXC + xvfmul.s VX3, x4, VXC + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, x3, VXS + xvfmul.d VX1, x1, VXS + xvfsub.d VX1, VXZ, VX1 + xvfmul.d VX2, x4, VXS + xvfmul.d VX3, x2, VXS + xvfsub.d VX3, VXZ, VX3 + xvstelm.d VX0, XX, 0, 0 + xvstelm.d 
VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, x3, VXS + xvfmul.s VX1, x1, VXS + xvfsub.s VX1, VXZ, VX1 + xvfmul.s VX2, x4, VXS + xvfmul.s VX3, x2, VXS + xvfsub.s VX3, VXZ, VX3 + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, 
YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 +#else + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L997 + .align 3 + +.L996: + move I, N + b .L998 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MUL s1, a1, C + MADD s1, a3, S, s1 + MUL s2, a1, S + MSUB s2, a3, C, s2 + MUL s3, a2, C + MADD s3, a4, S, s3 + MUL s4, a2, S + MSUB s4, a4, C, s4 + addi.d I, I, -1 + ST s1, X, 0 * SIZE + ST s2, Y, 0 * SIZE + ST s3, X, 1 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/crot_lsx.S b/kernel/loongarch64/crot_lsx.S new file mode 100644 index 000000000..126257edc --- /dev/null +++ b/kernel/loongarch64/crot_lsx.S @@ -0,0 +1,907 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define Y $r7
+#define INCY $r8
+#define C $f0
+#define S $f1
+
+#define I $r12
+#define TEMP $r13
+#define t1 $r14
+#define t2 $r16
+#define t3 $r15
+#define t4 $r17
+#define XX $r18
+#define YY $r19
+#define a1 $f12
+#define a2 $f13
+#define a3 $f14
+#define a4 $f15
+#define s1 $f16
+#define s2 $f17
+#define s3 $f18
+#define s4 $f19
+#define VX0 $vr8
+#define VX1 $vr20
+#define VX2 $vr21
+#define VX3 $vr22
+#define VT0 $vr10
+#define VT1 $vr18
+#define VXC $vr23
+#define VXS $vr9
+#define VXZ $vr11
+#define x1 $vr12
+#define x2 $vr13
+#define x3 $vr14
+#define x4 $vr15
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    FFINT a1, a1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    slli.d INCY, INCY, ZBASE_SHIFT
+    MTG t1, C
+    MTG t2, S
+    MTG t3, a1
+#ifdef DOUBLE
+    vreplgr2vr.d VXC, t1
+    vreplgr2vr.d VXS, t2
+    vreplgr2vr.d VXZ, t3
+    srai.d I, N, 1 // unit-stride vector loop handles 2 complex doubles per iteration
+#else
+    vreplgr2vr.w VXC, t1
+    vreplgr2vr.w VXS, t2
+    vreplgr2vr.w VXZ, t3
+    srai.d I, N, 2
+#endif
+    beq INCX, $r0, .L996
+    beq INCY, $r0, .L996
+    bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1
+    bne INCY, TEMP, .L22
+
+.L11:
+    bge $r0, I, .L997
+    CMPEQ $fcc0, C, a1
+    bcnez $fcc0, .L110
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L112 // C!=0 S==0
+    b .L111 // C!=0 S!=0
+    .align 3
+
+.L110:
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L114 // C==0 S==0
+    b .L113 // C==0 S!=0
+    .align 3
+
+.L111: // C!=0 S!=0
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+    vfmul.d VX0, x1, VXC
+    vfmadd.d VX0, x3, VXS, VX0
+    vfmul.d VX1, x1, VXS
+    vfmsub.d VX1, x3, VXC, VX1
+    vfmul.d VX2, x2, VXC
+    vfmadd.d VX2, x4, VXS, VX2
+    vfmul.d VX3, x2, VXS
+    vfmsub.d VX3, x4, VXC, VX3
+    vilvl.d x1, VX2, VX0
+    vilvh.d x2, VX2, VX0
+    vilvl.d x3, VX3, VX1
+    vilvh.d x4, VX3, VX1
+    vst x1, X, 0 * SIZE
+    vst x3, Y, 0 * SIZE
+    vst x2, X, 2 * SIZE
+    vst x4, Y, 2 * SIZE
+    addi.d X, X, 4 * SIZE
+    addi.d Y, Y, 4 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+
vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x1, VXC + vfmadd.s VX0, x3, VXS, VX0 + vfmul.s VX1, x1, VXS + vfmsub.s VX1, x3, VXC, VX1 + vfmul.s VX2, x2, VXC + vfmadd.s VX2, x4, VXS, VX2 + vfmul.s VX3, x2, VXS + vfmsub.s VX3, x4, VXC, VX3 + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x1, VXC + vfmul.s VX1, x3, VXC + vfmul.s VX2, x2, VXC + vfmul.s VX3, x4, VXC + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x3, VXS + vfmul.s VX1, x1, VXS + vfsub.s VX1, VXZ, VX1 + vfmul.s VX2, x4, VXS + vfmul.s VX3, x2, VXS + vfsub.s VX3, VXZ, VX3 + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vst VXZ, X, 4 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: +#ifdef DOUBLE + srai.d I, N, 2 +#endif + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + 
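The four strided branches that follow (.L221 through .L224) are lane-by-lane unrollings of the same per-element update that the .L998 scalar tail performs. As a reading aid, here is a minimal C sketch of that update; it is an illustration only, not part of the patch, and the names crot_ref, n, incx and incy are assumptions (the FLOAT typedef below stands in for OpenBLAS's element type). With real c and s, each complex pair is rotated as x' = c*x + s*y and y' = c*y - s*x, with real and imaginary parts handled independently:

    typedef float FLOAT;  /* stands in for OpenBLAS's FLOAT (float or double) */

    /* Reference sketch of the CROT update; mirrors the .L998 scalar tail. */
    static void crot_ref(long n, FLOAT *x, long incx, FLOAT *y, long incy,
                         FLOAT c, FLOAT s)
    {
        for (long i = 0; i < n; i++) {
            FLOAT xr = x[0], xi = x[1];   /* one complex element of X */
            FLOAT yr = y[0], yi = y[1];   /* one complex element of Y */
            x[0] = c * xr + s * yr;       /* the MUL/MADD pairs in .L998 */
            x[1] = c * xi + s * yi;
            y[0] = c * yr - s * xr;       /* the MUL/MSUB pairs in .L998 */
            y[1] = c * yi - s * xi;
            x += 2 * incx;                /* increments are per complex element */
            y += 2 * incy;
        }
    }

The special-cased branches fall out of this directly: S == 0 reduces to plain scaling by C, C == 0 to a scaled swap with negation, and C == S == 0 to storing zeros.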
+.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + vfmul.s VX0, x1, VXC + vfmadd.s VX0, x3, VXS, VX0 + vfmul.s VX1, x1, VXS + vfmsub.s VX1, x3, VXC, VX1 + vfmul.s VX2, x2, VXC + vfmadd.s VX2, x4, VXS, VX2 + vfmul.s VX3, x2, VXS + vfmsub.s VX3, x4, VXC, VX3 + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w 
VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 +#endif + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, x1, VXC + vfmul.s VX1, x3, VXC + vfmul.s VX2, x2, VXC + vfmul.s VX3, x4, VXC + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, 
YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 +#endif + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, x3, VXS + vfmul.s VX1, x1, VXS + vfsub.s VX1, VXZ, VX1 + vfmul.s VX2, x4, VXS + vfmul.s VX3, x2, VXS + vfsub.s VX3, VXZ, VX3 + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + 
add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 +#endif + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L995 +#else + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L997 +#endif + .align 3 + +#ifdef DOUBLE + .L995: + andi I, N, 3 + bge $r0, I, .L999 + b .L998 + .align 3 + +#endif +.L996: + move I, N + b .L998 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 1 +#else + andi I, N, 3 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MUL s1, a1, C + MADD s1, a3, S, s1 + MUL s2, a1, S + MSUB s2, a3, C, s2 + MUL s3, a2, C + MADD s3, a4, S, s3 + MUL s4, a2, S + MSUB s4, a4, C, s4 + addi.d I, I, -1 + ST s1, X, 0 * SIZE + ST s2, Y, 0 * SIZE + ST s3, X, 1 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S new file mode 100644 index 000000000..3605a6c0e --- /dev/null +++ b/kernel/loongarch64/cscal_lasx.S @@ -0,0 +1,645 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define VXZ $xr12 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + MTG t1, ALPHAR +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + xvreplgr2vr.d VXAI, t2 + xvxor.v VXZ, VXZ, VXZ + srai.d I, N, 2 +#else + xvreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + xvreplgr2vr.w VXAI, t2 + xvxor.v VXZ, VXZ, VXZ + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 + b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + +.L14: + bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + b .L111 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L111: //alpha_r == 0.0 && alpha_i == 0.0 + xvst VXZ, X, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvst VXZ, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: //alpha_r == 0.0 && alpha_i != 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VXAI, x2 + xvfsub.d x3, VXZ, x3 + xvfmul.d x4, VXAI, x1 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VXAI, x2 + xvfsub.s x3, VXZ, x3 + xvfmul.s x4, VXAI, x1 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: //alpha_r != 0.0 && alpha_i == 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VXAR, x1 + xvfmul.d x4, VXAR, x2 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld 
VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VXAR, x1 + xvfmul.s x4, VXAR, x2 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: //alpha_r != 0.0 && alpha_i != 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d VX0, VXAI, x2 + xvfmsub.d x3, VXAR, x1, VX0 + xvfmul.d VX1, VXAI, x1 + xvfmadd.d x4, VXAR, x2, VX1 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s VX0, VXAI, x2 + xvfmsub.s x3, VXAR, x1, VX0 + xvfmul.s VX1, VXAI, x1 + xvfmadd.s x4, VXAR, x2, VX1 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 + b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + +.L24: + bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + b .L221 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L221: //alpha_r == 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 +#else + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 +#endif + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: //alpha_r == 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d x3, VXAI, x2 + xvfsub.d x3, VXZ, x3 + xvfmul.d x4, VXAI, x1 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 
+ xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s x3, VXAI, x2 + xvfsub.s x3, VXZ, x3 + xvfmul.s x4, VXAI, x1 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: //alpha_r != 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d x3, VXAR, x1 + xvfmul.d x4, VXAR, x2 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s x3, VXAR, x1 + xvfmul.s x4, VXAR, x2 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + 
add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: //alpha_r != 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d VX0, VXAI, x2 + xvfmsub.d x3, VXAR, x1, VX0 + xvfmul.d VX1, VXAI, x1 + xvfmadd.d x4, VXAR, x2, VX1 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s VX0, VXAI, x2 + xvfmsub.s x3, VXAR, x1, VX0 + xvfmul.s VX1, VXAI, x1 + xvfmadd.s x4, VXAR, x2, VX1 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD 
a2, X, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s1, ALPHAR, a1, s1 + MADD s2, ALPHAR, a2, s2 + ST s1, X, 0 * SIZE + ST s2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cscal_lsx.S b/kernel/loongarch64/cscal_lsx.S new file mode 100644 index 000000000..f442a754f --- /dev/null +++ b/kernel/loongarch64/cscal_lsx.S @@ -0,0 +1,571 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXAR $vr23 +#define VXAI $vr19 +#define VXZ $vr12 +#define x1 $vr18 +#define x2 $vr17 +#define x3 $vr16 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + MTG t1, ALPHAR +#ifdef DOUBLE + vreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + vreplgr2vr.d VXAI, t2 +#else + vreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + vreplgr2vr.w VXAI, t2 +#endif + vxor.v VXZ, VXZ, VXZ + srai.d I, N, 2 + bne INCX, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 + b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + +.L14: + bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + b .L111 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L111: //alpha_r == 0.0 && alpha_i == 0.0 + vst VXZ, X, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, X, 4 * SIZE + vst VXZ, X, 6 * SIZE +#else + vst VXZ, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: //alpha_r == 0.0 && alpha_i != 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXAI, x2 + vfsub.s x3, VXZ, x3 + vfmul.s x4, VXAI, x1 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: //alpha_r != 0.0 && alpha_i == 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXAR, x1 + vfmul.s x4, VXAR, x2 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: //alpha_r != 0.0 && alpha_i != 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 
2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s VX0, VXAI, x2 + vfmsub.s x3, VXAR, x1, VX0 + vfmul.s VX1, VXAI, x1 + vfmadd.s x4, VXAR, x2, VX1 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 + b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + +.L24: + bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + b .L221 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L221: //alpha_r == 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 +#else + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 +#endif + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: //alpha_r == 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s x3, VXAI, x2 + vfsub.s x3, VXZ, x3 + vfmul.s x4, VXAI, x1 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w 
x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: //alpha_r != 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s x3, VXAR, x1 + vfmul.s x4, VXAR, x2 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: //alpha_r != 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * 
SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s VX0, VXAI, x2 + vfmsub.s x3, VXAR, x1, VX0 + vfmul.s VX1, VXAI, x1 + vfmadd.s x4, VXAR, x2, VX1 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s1, ALPHAR, a1, s1 + MADD s2, ALPHAR, a2, s2 + ST s1, X, 0 * SIZE + ST s2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/csum_lasx.S b/kernel/loongarch64/csum_lasx.S new file mode 100644 index 000000000..3e65f2c15 --- /dev/null +++ b/kernel/loongarch64/csum_lasx.S @@ -0,0 +1,274 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvfadd.d res2, VX2, VX3 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res1, 4 + xvpickve.w VX1, res1, 5 + xvpickve.w VX2, res1, 6 + xvpickve.w VX3, res1, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 +
xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res1, 4 + xvpickve.w VX1, res1, 5 + xvpickve.w VX2, res1, 6 + xvpickve.w VX3, res1, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: +#ifdef DOUBLE + fmov.d $f0, $f16 +#else + fmov.s $f0, $f16 +#endif + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/csum_lsx.S b/kernel/loongarch64/csum_lsx.S new file mode 100644 index 000000000..8de8e27ca --- /dev/null +++ b/kernel/loongarch64/csum_lsx.S @@ -0,0 +1,266 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfadd.s res2, VX0, VX1 + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + 
vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfadd.s res2, VX0, VX1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: +#ifdef DOUBLE + fmov.d $f0, $f16 +#else + fmov.s $f0, $f16 +#endif + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cswap_lasx.S b/kernel/loongarch64/cswap_lasx.S new file mode 100644 index 000000000..d53773d5a --- /dev/null +++ b/kernel/loongarch64/cswap_lasx.S @@ -0,0 +1,394 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvst VX2, X, 0 * SIZE + xvst VX0, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + xvstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 2 + ld.d t4, Y, 1 * SIZE + xvstelm.d VX0, Y, 1 * SIZE, 3 + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + xvstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + xvstelm.d VX1, Y, 0 * SIZE, 2 + ld.d t4, Y, 1 * SIZE + xvstelm.d VX1, Y, 1 * SIZE, 3 + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvst VX3, X, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 3 + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 4 + ld.w t2, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 6 + ld.w t4, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 7 + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L123: + LD 
a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + xvstelm.d VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX2, X, 0 * SIZE, 2 + ld.d t4, X, 1 * SIZE + xvstelm.d VX2, X, 1 * SIZE, 3 + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvst VX0, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX3, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + xvstelm.d VX3, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX3, X, 0 * SIZE, 2 + ld.d t4, X, 1 * SIZE + xvstelm.d VX3, X, 1 * SIZE, 3 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvst VX1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 3 + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 4 + ld.w t2, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 5 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 6 + ld.w t4, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 7 + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvst VX0, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + ST b1, XX, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, X, 0 * SIZE + ST b3, XX, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + ST b1, XX, 0 * SIZE + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + ST b3, XX, 0 * SIZE + ST b4, XX, 1 * SIZE + + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + 
ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cswap_lsx.S b/kernel/loongarch64/cswap_lsx.S new file mode 100644 index 000000000..62a869066 --- /dev/null +++ b/kernel/loongarch64/cswap_lsx.S @@ -0,0 +1,421 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + vstelm.d VX0, Y, 1 * SIZE, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t4, Y, 1 * SIZE + vstelm.d VX1, Y, 1 * SIZE, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + vstelm.d VX0, Y, 1 * SIZE, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t4, Y, 1 * SIZE + vstelm.d VX1, Y, 1 * SIZE, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + vstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + vstelm.w VX0, Y, 1 * SIZE, 3 + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX1, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + vstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w 
VX1, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + vstelm.w VX1, Y, 1 * SIZE, 3 + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + vstelm.d VX2, X, 1 * SIZE, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0 * SIZE, 0 + ld.d t4, X, 1 * SIZE + vstelm.d VX3, X, 1 * SIZE, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + vstelm.d VX2, X, 1 * SIZE, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0 * SIZE, 0 + ld.d t4, X, 1 * SIZE + vstelm.d VX3, X, 1 * SIZE, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX2, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + vstelm.w VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + vstelm.w VX2, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + vstelm.w VX2, X, 1 * SIZE, 3 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vst VX0, Y, 0 * SIZE + + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX3, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + vstelm.w VX3, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + vstelm.w VX3, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + vstelm.w VX3, X, 1 * SIZE, 3 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vst VX1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + ST b1, XX, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, X, 0 * SIZE + ST b3, XX, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST 
a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + ST b1, XX, 0 * SIZE + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + ST b3, XX, 0 * SIZE + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_kernel_8x4.S b/kernel/loongarch64/dgemm_kernel_8x4.S new file mode 100644 index 000000000..405f1bd97 --- /dev/null +++ b/kernel/loongarch64/dgemm_kernel_8x4.S @@ -0,0 +1,2894 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA $f0 // param 4: alpha +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc + +#ifdef TRMMKERNEL +#define OFFSET $r11 // param 9: offset +#endif +#define OFF $r12 + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 /* !! DO NOT USE $r21 and $r22 !! 
*/ +#define T1 $r26 +#define T2 $r27 +#define ZERO $r0 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define D0 $vr16 +#define D1 $vr17 +#define D2 $vr18 +#define D3 $vr19 +#define D4 $vr20 +#define D5 $vr21 +#define D6 $vr22 +#define D7 $vr23 +#define D8 $vr24 +#define D9 $vr25 +#define D10 $vr26 +#define D11 $vr27 +#define D12 $vr28 +#define D13 $vr29 +#define D14 $vr30 +#define D15 $vr31 +#define VALPHA $vr15 + +/* Prefetch interval */ +#define A_PRE 0x200 +#define B_PRE 0x100 + +.macro KERNEL2x8x4 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + vldrepl.d U5, B0, 0x08 + vfmadd.d D10, U10, U14, D10 + vfmadd.d D11, U11, U14, D11 + + preld 0, A0, A_PRE + vldrepl.d U6, B0, 0x10 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + vldrepl.d U7, B0, 0x18 + vfmadd.d D14, U10, U15, D14 + vfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + vld U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vld U9, A0, 0x10 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vld U10, A0, 0x20 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vld U11, A0, 0x30 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U12, B0, 0x00 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + vldrepl.d U13, B0, 0x08 + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + vldrepl.d U14, B0, 0x10 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + vldrepl.d U15, B0, 0x18 + vfmadd.d D14, U2, U7, D14 + vfmadd.d D15, U3, U7, D15 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x8x4_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + vldrepl.d U5, B0, 0x08 + vfmadd.d D10, U10, U14, D10 + vfmadd.d D11, U11, U14, D11 + + preld 0, A0, A_PRE + vldrepl.d U6, B0, 0x10 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + vldrepl.d U7, B0, 0x18 + vfmadd.d D14, U10, U15, D14 + vfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + vfmadd.d D14, U2, U7, D14 + vfmadd.d 
D15, U3, U7, D15 +.endm + +.macro KERNEL8x8x4 +.rept 4 + KERNEL2x8x4 +.endr +.endm + +.macro KERNEL8x8x4_END +.rept 3 + KERNEL2x8x4 +.endr + KERNEL2x8x4_END +.endm + +.macro KERNEL2x4x4 + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + vldrepl.d U12, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U13, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U14, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + vldrepl.d U15, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x4_END + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 +.endm + +.macro KERNEL8x4x4 +.rept 4 + KERNEL2x4x4 +.endr +.endm + +.macro KERNEL8x4x4_END +.rept 3 + KERNEL2x4x4 +.endr + KERNEL2x4x4_END +.endm + +.macro KERNEL2x2x4 + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vfmadd.d D2, U9, U12, D2 + vfmadd.d D3, U9, U13, D3 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + vldrepl.d U8, A0, 0x00 + vldrepl.d U9, A0, 0x08 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 + + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x2x4_END + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vfmadd.d D2, U9, U12, D2 + vfmadd.d D3, U9, U13, D3 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 +.endm + +.macro KERNEL8x2x4 +.rept 4 + KERNEL2x2x4 +.endr +.endm + +.macro KERNEL8x2x4_END +.rept 3 + KERNEL2x2x4 +.endr + KERNEL2x2x4_END +.endm + +.macro KERNEL2x1x4 + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vldrepl.d U8, A0, 0x08 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vld U12, B0, 0x20 + vld U13, B0, 0x30 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x40 +.endm + +.macro KERNEL2x1x4_END + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 +.endm + +.macro KERNEL8x1x4 +.rept 4 + KERNEL2x1x4 +.endr +.endm + +.macro KERNEL8x1x4_END +.rept 3 + KERNEL2x1x4 +.endr + KERNEL2x1x4_END +.endm 
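+/* Note on the KERNEL2x*x* macros: each body consists of two
+ * software-pipelined steps. The vfmadd.d chain consumes the A/B values
+ * fetched in the previous step (in the 8-wide case U8-U11 from A and
+ * U12-U15 from B) while the loads for the next step are already being
+ * issued, hiding load latency. The *_END variants perform the same
+ * arithmetic but omit the trailing loads, which is why each unrolled
+ * loop must finish with its _END form. */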
+ +.macro KERNEL2x8x2 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + vld U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vld U9, A0, 0x10 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vld U10, A0, 0x20 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vld U11, A0, 0x30 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x8x2_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 +.endm + +.macro KERNEL8x8x2 +.rept 4 + KERNEL2x8x2 +.endr +.endm + +.macro KERNEL8x8x2_END +.rept 3 + KERNEL2x8x2 +.endr + KERNEL2x8x2_END +.endm + +.macro KERNEL2x4x2 + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + vld U8, A0, 0x20 + vld U9, A0, 0x30 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U12, B0, 0x10 + vldrepl.d U13, B0, 0x18 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x2_END + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 +.endm + +.macro KERNEL8x4x2 +.rept 4 + KERNEL2x4x2 +.endr +.endm + +.macro KERNEL8x4x2_END +.rept 3 + KERNEL2x4x2 +.endr + KERNEL2x4x2_END +.endm + +.macro KERNEL2x2x2 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D4, U8, U13, D4 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + vld U8, A0, 0x10 + vldrepl.d U12, B0, 0x10 + vldrepl.d U13, B0, 0x18 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D4, U0, U5, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x2x2_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D4, U8, U13, D4 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D4, U0, U5, D4 +.endm + +.macro KERNEL8x2x2 +.rept 4 + KERNEL2x2x2 +.endr +.endm + +.macro KERNEL8x2x2_END +.rept 3 + KERNEL2x2x2 +.endr + KERNEL2x2x2_END +.endm + +.macro KERNEL2x1x2 + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + + vld U4, B0, 0x00 + vldrepl.d U8, A0, 0x08 + vld U12, B0, 0x10 + vfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm + 
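+/* The vldrepl.d/vld pairing encodes which operand is broadcast: the
+ * wide-M kernels vector-load A and splat each B element, while narrow
+ * kernels (for example KERNEL2x1x2 just above, or KERNEL2x2x4) splat the
+ * A element(s) and vector-load B, so a single vfmadd.d still fills a
+ * whole vector of C accumulators spanning two columns. */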
+.macro KERNEL2x1x2_END + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + + vld U4, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x2 +.rept 4 + KERNEL2x1x2 +.endr +.endm + +.macro KERNEL8x1x2_END +.rept 3 + KERNEL2x1x2 +.endr + KERNEL2x1x2_END +.endm + +.macro KERNEL2x8x1 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vldrepl.d U4, B0, 0x00 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vld U8, A0, 0x40 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vld U9, A0, 0x50 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vld U10, A0, 0x60 + vld U11, A0, 0x70 + + vldrepl.d U12, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x8x1_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 +.endm + +.macro KERNEL8x8x1 +.rept 4 + KERNEL2x8x1 +.endr +.endm + +.macro KERNEL8x8x1_END +.rept 3 + KERNEL2x8x1 +.endr + KERNEL2x8x1_END +.endm + +.macro KERNEL2x4x1 + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vldrepl.d U4, B0, 0x00 + + vld U8, A0, 0x20 + vld U9, A0, 0x30 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vldrepl.d U12, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x4x1_END + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 +.endm + +.macro KERNEL8x4x1 +.rept 4 + KERNEL2x4x1 +.endr +.endm + +.macro KERNEL8x4x1_END +.rept 3 + KERNEL2x4x1 +.endr + KERNEL2x4x1_END +.endm + +.macro KERNEL2x2x1 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + vld U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x2x1_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x2x1 +.rept 4 + KERNEL2x2x1 +.endr +.endm + +.macro KERNEL8x2x1_END +.rept 3 + KERNEL2x2x1 +.endr + KERNEL2x2x1_END +.endm + +.macro KERNEL2x1x1 + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + vldrepl.d U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x1x1_END + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x1 +.rept 4 + KERNEL2x1x1 +.endr +.endm + +.macro KERNEL8x1x1_END +.rept 3 + KERNEL2x1x1 +.endr + KERNEL2x1x1_END +.endm + + + PROLOGUE + + addi.d $sp, $sp, -112 + /* Store regs */ + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f24, $sp, 40 + ST $f25, $sp, 48 + ST $f26, $sp, 56 + ST $f27, $sp, 64 + ST $f28, $sp, 72 + ST $f29, $sp, 80 + ST $f30, $sp, 88 + 
ST $f31, $sp, 96 + ST ALPHA, $sp, 104 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif + + /* if (!(N >> 2)) goto L_N3 */ + srai.d J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + vldrepl.d VALPHA, $sp, 104 /* When N < 4, VALPHA will not be changed */ + beq ZERO, J, .L_N3 + +.L_J1: /* J-- && this loop includes Condition 1 */ + +/************************* Condition 1 if((N >> 2) && (M >> 3)) START !!! ************************* +* dgemm_core_8x4 */ + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + addi.d J, J, -1 /* J-- */ + add.d C2, C1, T0 + add.d C3, C2, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_M8 + +.L_I1: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Calculate the first set of D0~D15, + * avoiding a separate zero-initialization pass + * Load 8 * 64 from A0 + * U0 = {a1, a0} + * U1 = {a3, a2} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + preld 0, C0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + preld 0, C0, 0x20 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + vldrepl.d U5, B0, 0x08 + preld 0, C1, 0x00 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + preld 0, C1, 0x20 + vfmul.d D6, U2, U5 + vfmul.d D7, U3, U5 + + vldrepl.d U6, B0, 0x10 + preld 0, C2, 0x00 + /* line 3 */ + vfmul.d D8, U0, U6 + vfmul.d D9, U1, U6 + preld 0, C2, 0x20 + vfmul.d D10, U2, U6 + vfmul.d D11, U3, U6 + + vldrepl.d U7, B0, 0x18 + preld 0, C3, 0x00 + /* line 4 */ + vfmul.d D12, U0, U7 + vfmul.d D13, U1, U7 + preld 0, C3, 0x20 + vfmul.d D14, U2, U7 + vfmul.d D15, U3, U7 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + vldrepl.d U14, B0, 0x10 + vldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_TL1_END +.L_TL1: /* TL-- */ + KERNEL8x8x4 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_TL1 + +.L_TL1_END: + KERNEL8x8x4_END + + /* Handle the remaining (L & 7) iterations, + * still accumulating into D0~D15.
+ */ +.L_L7: + /* if (!(L & 7)) goto L_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_L0 + +.L_L71: + /* Load 8 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + /* Accumulate into D0~D15 */ + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + vfmadd.d D14, U2, U7, D14 + vfmadd.d D15, U3, U7, D15 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_L71 + +.L_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D6, D6, VALPHA + vfmul.d D7, D7, VALPHA + vfmul.d D8, D8, VALPHA + vfmul.d D9, D9, VALPHA + vfmul.d D10, D10, VALPHA + vfmul.d D11, D11, VALPHA + vfmul.d D12, D12, VALPHA + vfmul.d D13, D13, VALPHA + vfmul.d D14, D14, VALPHA + vfmul.d D15, D15, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + vld U4, C1, 0x00 + vld U5, C1, 0x10 + vld U6, C1, 0x20 + vld U7, C1, 0x30 + vfmadd.d D4, D4, VALPHA, U4 + vfmadd.d D5, D5, VALPHA, U5 + vfmadd.d D6, D6, VALPHA, U6 + vfmadd.d D7, D7, VALPHA, U7 + + /* Load C2 */ + vld U8, C2, 0x00 + vld U9, C2, 0x10 + vld U10, C2, 0x20 + vld U11, C2, 0x30 + vfmadd.d D8, D8, VALPHA, U8 + vfmadd.d D9, D9, VALPHA, U9 + vfmadd.d D10, D10, VALPHA, U10 + vfmadd.d D11, D11, VALPHA, U11 + + /* Load C3 */ + vld U0, C3, 0x00 + vld U1, C3, 0x10 + vld U2, C3, 0x20 + vld U3, C3, 0x30 + vfmadd.d D12, D12, VALPHA, U0 + vfmadd.d D13, D13, VALPHA, U1 + vfmadd.d D14, D14, VALPHA, U2 + vfmadd.d D15, D15, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + vst D6, C1, 0x20 + vst D7, C1, 0x30 + /* Store C2 */ + vst D8, C2, 0x00 + vst D9, C2, 0x10 + vst D10, C2, 0x20 + vst D11, C2, 0x30 + /* Store C3 */ + vst D12, C3, 0x00 + vst D13, C3, 0x10 + vst D14, C3, 0x20 + vst D15, C3, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -8 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_I1 + +.L_M8: + /* The M >> 3 blocks are done; handle the remaining M & 4, M & 2 and M & 1 cases */ + andi I, M, 7 + beq ZERO,I, .L_M0 + + andi I, M, 4 + beq ZERO,I, .L_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d
A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + + vldrepl.d U6, B0, 0x10 + /* line 3 */ + vfmul.d D8, U0, U6 + vfmul.d D9, U1, U6 + + vldrepl.d U7, B0, 0x18 + /* line 4 */ + vfmul.d D12, U0, U7 + vfmul.d D13, U1, U7 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + vldrepl.d U14, B0, 0x10 + vldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M4_TL1_END + +.L_M4_TL1: /* TL-- */ + KERNEL8x4x4 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_TL1 + +.L_M4_TL1_END: + KERNEL8x4x4_END + +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 + +.L_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_L71 + +.L_M4_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D8, D8, VALPHA + vfmul.d D9, D9, VALPHA + vfmul.d D12, D12, VALPHA + vfmul.d D13, D13, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + vld U2, C1, 0x00 + vld U3, C1, 0x10 + vfmadd.d D4, D4, VALPHA, U2 + vfmadd.d D5, D5, VALPHA, U3 + + /* Load C2 */ + vld U4, C2, 0x00 + vld U5, C2, 0x10 + vfmadd.d D8, D8, VALPHA, U4 + vfmadd.d D9, D9, VALPHA, U5 + + /* Load C3 */ + vld U6, C3, 0x00 + vld U7, C3, 0x10 + vfmadd.d D12, D12, VALPHA, U6 + vfmadd.d D13, D13, VALPHA, U7 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + /* Store C2 */ + vst D8, C2, 0x00 + vst D9, C2, 0x10 + /* Store C3 */ + vst D12, C3, 0x00 + vst D13, C3, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ 
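For orientation, a minimal C sketch of what the 8x4 micro-kernel above computes in its plain (non-TRMM) path follows; the function name, signature and packed-panel layout are illustrative assumptions made for this note, not part of the kernel:

/* Reference semantics only (assumed packing: A as K slices of 8 doubles,
   B as K slices of 4 doubles, C column-major with leading dimension ldc). */
static void dgemm_micro_8x4_ref(long K, double alpha, const double *A,
                                const double *B, double *C, long ldc)
{
    double acc[4][8] = {{0.0}};         /* plays the role of D0..D15      */
    for (long k = 0; k < K; k++)        /* the .L_TL1/.L_L71 loops        */
        for (int j = 0; j < 4; j++)     /* the four vldrepl.d of B        */
            for (int i = 0; i < 8; i++) /* the vld'ed 8-double slice of A */
                acc[j][i] += A[8 * k + i] * B[4 * k + j];
    for (int j = 0; j < 4; j++)         /* the .L_L0 write-back           */
        for (int i = 0; i < 8; i++)
            C[(long)j * ldc + i] += alpha * acc[j][i];
}

The assembly keeps acc entirely in vector registers (D0-D15, two doubles each) and fuses the alpha scaling with the reload of C through vfmadd.d, so each block of C is read and written exactly once.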
+ +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vfmul.d D0, U0, U4 + vfmul.d D1, U0, U5 + vfmul.d D2, U1, U4 + vfmul.d D3, U1, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 + + vldrepl.d U8, A0, 0x00 + vldrepl.d U9, A0, 0x08 + + addi.d TL, TL, -1 + + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M2_TL1_END +.L_M2_TL1: /* TL-- */ + KERNEL8x2x4 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 +.L_M2_TL1_END: + KERNEL8x2x4_END + +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 + +.L_M2_L71: + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M2_L71 + +.L_M2_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + vstelm.d D1, C2, 0x00, 0x00 + vstelm.d D1, C3, 0x00, 0x01 + vstelm.d D2, C0, 0x08, 0x00 + vstelm.d D2, C1, 0x08, 0x01 + vstelm.d D3, C2, 0x08, 0x00 + vstelm.d D3, C3, 0x08, 0x01 +#else + /* Load C0 */ + vld U0, C0, 0x00 + /* Load C1 */ + vld U1, C1, 0x00 + /* Load C2 */ + vld U2, C2, 0x00 + /* Load C3 */ + vld U3, C3, 0x00 + + vilvl.d D4, D2, D0 //C0 + vilvh.d D5, D2, D0 //C1 + vilvl.d D6, D3, D1 //C2 + vilvh.d D7, D3, D1 //C3 + + vfmadd.d D0, D4, VALPHA, U0 + vfmadd.d D2, D5, VALPHA, U1 + vfmadd.d D1, D6, VALPHA, U2 + vfmadd.d D3, D7, VALPHA, U3 + + vst D0, C0, 0x00 + vst D2, C1, 0x00 + vst D1, C2, 0x00 + vst D3, C3, 0x00 +#endif // #if defined(TRMMKERNEL) + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ + +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + vldrepl.d U0, A0, 0x00 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + vfmul.d D0, U0, U4 + vfmul.d D1, U0, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M1_TL1_END + +.L_M1_TL1: /* TL-- */ + KERNEL8x1x4 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 +.L_M1_TL1_END: + KERNEL8x1x4_END + +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 + +.L_M1_L71: + vldrepl.d U0, A0, 0x00 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M1_L71 + +.L_M1_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + vstelm.d D1, C2, 0x00, 0x00 + vstelm.d D1, C3, 0x00, 0x01 +#else + /* Load C0 */ + vldrepl.d U0, C0, 0x00 + vldrepl.d U1, C1, 0x00 + vilvl.d D4, U1, U0 + vfmadd.d D6, D0, VALPHA, D4 + + vldrepl.d U2, C2, 0x00 + vldrepl.d U3, C3, 0x00 + vilvl.d D5, U3, U2 + vfmadd.d D7, D1, VALPHA, D5 + + vstelm.d D6, C0, 0x00, 0x00 + vstelm.d D6, C1, 0x00, 0x01 + vstelm.d D7, C2, 0x00, 0x00 + vstelm.d D7, C3, 0x00, 0x01 +#endif // #if defined(TRMMKERNEL) + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + addi.d C2, C2, 0x08 + addi.d C3, C3, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -1 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 1) ) End************/ + +.L_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + /* since the array type is double, + * so we must mul 32 + */ + slli.d T0, K, 5 + slli.d T1, LDC, 5 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x04 +#endif + + blt ZERO, J, .L_J1 + +//////////////// go back to L_J1 ///////////////// +///////////////////////////////////////////////// +/************************ Condition 1 if((N >> 2) && (M >> 3)) END !!! ************************/ + + vldrepl.d VALPHA, $sp, 104 + +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +/************************* Condition 2 if((N & 2) && (M >> 3)) START !!! 
************************* +* dgemm_core_16x2 */ + + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_N3_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_N3_M8 + +.L_N3_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 + * U0 = {a1, a0} + * U1 = {a3, a2} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + vfmul.d D6, U2, U5 + vfmul.d D7, U3, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_N3_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_TL1_END + +.L_N3_TL1: /* TL-- */ + KERNEL8x8x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_TL1 +.L_N3_TL1_END: + KERNEL8x8x2_END + +.L_N3_L7: + /* if (!(L & 7)) goto L_N3_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_L0 + +.L_N3_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_L71 + +.L_N3_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D6, D6, VALPHA + vfmul.d D7, D7, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + vld U4, C1, 0x00 + vld U5, C1, 0x10 + vld U6, C1, 0x20 + vld U7, C1, 0x30 + vfmadd.d D4, D4, VALPHA, U4 + vfmadd.d D5, D5, VALPHA, U5 + vfmadd.d D6, D6, VALPHA, U6 + vfmadd.d D7, D7, VALPHA, U7 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + vst D6, C1, 0x20 + vst D7, C1, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef 
LEFT + addi.d L, L, -8 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x8 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N3_I1 + +.L_N3_M8: + /* We have done M & 8, considering M=4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_N3_M0 + + andi I, M, 4 + beq ZERO,I, .L_N3_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M4_L7 */ + beq ZERO,TL, .L_N3_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M4_TL1_END + +.L_N3_M4_TL1: /* TL-- */ + KERNEL8x4x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M4_TL1 +.L_N3_M4_TL1_END: + KERNEL8x4x2_END + +.L_N3_M4_L7: + /* if (!(L & 7)) goto L_N3_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M4_L0 + +.L_N3_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M4_L71 + +.L_N3_M4_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + vld U2, C1, 0x00 + vld U3, C1, 0x10 + vfmadd.d D4, D4, VALPHA, U2 + vfmadd.d D5, D5, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 4) ) End************/ + +.L_N3_M2: + andi I, M, 2 + beq ZERO,I, .L_N3_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d 
L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + vldrepl.d U4, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M2_L7 */ + beq ZERO,TL, .L_N3_M2_L7 + + vld U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M2_TL1_END + +.L_N3_M2_TL1: /* TL-- */ + KERNEL8x2x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M2_TL1 +.L_N3_M2_TL1_END: + KERNEL8x2x2_END + +.L_N3_M2_L7: + /* if (!(L & 7)) goto L_N3_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M2_L0 + +.L_N3_M2_L71: + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + vfmadd.d D0, U0, U4, D0 + + vfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M2_L71 + +.L_N3_M2_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + vld U1, C1, 0x00 + vfmadd.d D4, D4, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + vst D0, C0, 0x00 + vst D4, C1, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 2) ) End************/ + +.L_N3_M1: + andi I, M, 1 + beq ZERO,I, .L_N3_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + + vld U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_N3_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + + vld U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M1_TL1_END + +.L_N3_M1_TL1: /* TL-- */ + KERNEL8x1x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M1_TL1 +.L_N3_M1_TL1_END: + KERNEL8x1x2_END + +.L_N3_M1_L7: + /* if (!(L & 7)) goto L_N3_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M1_L0 + +.L_N3_M1_L71: + vldrepl.d U0, A0, 0x00 + + vld U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d 
B0, B0, 0x10
+
+    addi.d TL, TL, -1
+    blt ZERO,TL, .L_N3_M1_L71
+
+.L_N3_M1_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d D0, D0, VALPHA
+#else
+    /* Load C0 */
+    vld U0, C0, 0x00
+    vld U1, C1, 0x00
+    vilvl.d U2, U1, U0
+    vfmadd.d D0, D0, VALPHA, U2
+#endif // #if defined(TRMMKERNEL)
+
+    vstelm.d D0, C0, 0x00, 0x00
+    vstelm.d D0, C1, 0x00, 0x01
+
+    /* Add stride for C */
+    addi.d C0, C0, 0x08
+    addi.d C1, C1, 0x08
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d L, K, OFF
+#ifdef LEFT
+    addi.d L, L, -1
+#else
+    addi.d L, L, -2
+#endif
+    slli.d T0, L, 0x03
+    add.d A0, A0, T0
+    slli.d T0, L, 0x04
+    add.d B0, B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d OFF, OFF, 0x01
+#endif
+#endif // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N & 2 ) && (M & 1) ) End************/
+
+.L_N3_M0:
+    /* Add stride for B and C
+     * B += (K * 16)
+     * C += (LDC * 16)
+     */
+    /* each element is a double (8 bytes) and we advance two
+     * columns, so we must multiply by 2 * 8 = 16
+     */
+    slli.d T0, K, 4
+    slli.d T1, LDC, 4
+    add.d B, B, T0
+    add.d C, C, T1
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d OFF, OFF, 0x02
+#endif
+
+    /* We must reinit I */
+    srai.d I, M, 4 /* I = bm >> 4 */
+
+/************************* Condition 2 if((N & 2) && (M >> 3)) End !!! *************************
+* dgemm_core_16x2 */
+
+.L_N1:
+    andi J, N, 1
+    beq ZERO, J, .L_N0
+
+/************************* Condition 3 if((N & 1) && (M >> 3)) START !!! *************************
+* dgemm_core_16x1 */
+
+    move C0, C
+    move A0, A
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move OFF, OFFSET
+#endif
+
+    /* if (!(M >> 3)) goto L_N1_M8 */
+    srai.d I, M, 3 /* I = bm >> 3 */
+    beq ZERO, I, .L_N1_M8
+
+.L_N1_I1:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move B0, B
+#else
+    slli.d T0, OFF, 0x06
+    add.d A0, A0, T0
+    slli.d T0, OFF, 0x03
+    add.d B0, B, T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d L, K, OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d L, OFF, 8
+#else
+    /* number of values in B */
+    addi.d L, OFF, 1
+#endif
+#else // #if !defined(TRMMKERNEL)
+    move B0, B
+    move L, K /* L = bk */
+#endif
+
+    /* Load 8 * 64 from A0
+     * U0 = {a1, a0}
+     * U1 = {a3, a2}
+     * U2 = {a5, a4}
+     * U3 = {a7, a6}
+     */
+
+    vld U0, A0, 0x00
+    vld U1, A0, 0x10
+    vld U2, A0, 0x20
+    vld U3, A0, 0x30
+
+    vldrepl.d U4, B0, 0x00
+    /* line 1 */
+    vfmul.d D0, U0, U4
+    vfmul.d D1, U1, U4
+    vfmul.d D2, U2, U4
+    vfmul.d D3, U3, U4
+
+    /* Add stride for A0 and B0 */
+    addi.d A0, A0, 0x40
+    addi.d B0, B0, 0x08
+    /* Reduce L */
+    addi.d L, L, -1
+    srai.d TL, L, 3 /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_L7 */
+    beq ZERO,TL, .L_N1_L7
+
+    vld U8, A0, 0x00
+    vld U9, A0, 0x10
+    vld U10, A0, 0x20
+    vld U11, A0, 0x30
+
+    addi.d TL, TL, -1
+
+    vldrepl.d U12, B0, 0x00
+    addi.d A0, A0, 0x40
+    addi.d B0, B0, 0x08
+
+    beq ZERO, TL, .L_N1_TL1_END
+.L_N1_TL1: /* TL-- */
+    KERNEL8x8x1
+
+    addi.d TL, TL, -1 /* TL-- */
+    blt ZERO,TL, .L_N1_TL1
+.L_N1_TL1_END:
+    KERNEL8x8x1_END
+
+.L_N1_L7:
+    /* if (!(L & 7)) goto L_N1_L0 */
+    andi TL, L, 7
+    beq TL, ZERO,.L_N1_L0
+
+.L_N1_L71:
+    /* Load 8 * 64 from A0 */
+    vld U0, A0, 0x00
+    vld U1, A0, 0x10
+    vld U2, A0, 0x20
+    vld U3, A0, 0x30
+
+    vldrepl.d U4, B0, 0x00
+    vfmadd.d D0, U0, U4, D0
+    vfmadd.d D1, U1, U4, D1
+    vfmadd.d D2, U2, U4, D2
+    vfmadd.d D3, U3, U4, D3
+
+    /* Add stride for A0, B0 */
+    addi.d A0, A0, 0x40
+    addi.d B0, B0, 0x08
+
+    addi.d TL, TL, -1
+    blt ZERO,TL, .L_N1_L71
+
+.L_N1_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d D0, D0, VALPHA
+    vfmul.d D1, D1, VALPHA
+    vfmul.d D2, D2, VALPHA
+    vfmul.d D3, D3, VALPHA
+#else
+    /* Load C0 */
+    vld U0, C0, 0x00
+    vld U1, C0, 0x10
+    vld U2, C0, 0x20
+    vld U3, C0, 0x30
+    vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */
+    vfmadd.d D1, D1, VALPHA, U1
+    vfmadd.d D2, D2, VALPHA, U2
+    vfmadd.d D3, D3, VALPHA, U3
+#endif // #if defined(TRMMKERNEL)
+
+    /* Store C0 */
+    vst D0, C0, 0x00
+    vst D1, C0, 0x10
+    vst D2, C0, 0x20
+    vst D3, C0, 0x30
+
+    /* Add stride for C */
+    addi.d C0, C0, 0x40
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d L, K, OFF
+#ifdef LEFT
+    addi.d L, L, -8
+#else
+    addi.d L, L, -1
+#endif
+    slli.d T0, L, 0x06
+    add.d A0, A0, T0
+    slli.d T0, L, 0x03
+    add.d B0, B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d OFF, OFF, 0x8
+#endif
+#endif // #if defined(TRMMKERNEL)
+
+    addi.d I, I, -1 /* I-- */
+    blt ZERO,I, .L_N1_I1
+
+.L_N1_M8:
+    /* We have done the M >> 3 blocks, considering M=4/2/1 */
+    andi I, M, 7
+    beq ZERO,I, .L_N1_M0
+
+    andi I, M, 4
+    beq ZERO,I, .L_N1_M2
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move B0, B
+#else
+    slli.d T0, OFF, 0x05
+    add.d A0, A0, T0
+    slli.d T0, OFF, 0x03
+    add.d B0, B, T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d L, K, OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d L, OFF, 4
+#else
+    /* number of values in B */
+    addi.d L, OFF, 1
+#endif
+#else // #if !defined(TRMMKERNEL)
+    move B0, B
+    move L, K /* L = bk */
+#endif
+
+    /* Load 4 * 64 from A0 */
+    vld U0, A0, 0x00
+    vld U1, A0, 0x10
+
+    vldrepl.d U4, B0, 0x00
+    /* line 1 */
+    vfmul.d D0, U0, U4
+    vfmul.d D1, U1, U4
+
+    /* Add stride for A0 and B0 */
+    addi.d A0, A0, 0x20
+    addi.d B0, B0, 0x08
+    /* Reduce L */
+    addi.d L, L, -1
+    srai.d TL, L, 3 /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M4_L7 */
+    beq ZERO,TL, .L_N1_M4_L7
+
+    vld U8, A0, 0x00
+    vld U9, A0, 0x10
+
+    addi.d TL, TL, -1
+
+    vldrepl.d U12, B0, 0x00
+    addi.d A0, A0, 0x20
+    addi.d B0, B0, 0x08
+
+    beq ZERO, TL, .L_N1_M4_TL1_END
+
+.L_N1_M4_TL1: /* TL-- */
+    KERNEL8x4x1
+
+    addi.d TL, TL, -1 /* TL-- */
+    blt ZERO,TL, .L_N1_M4_TL1
+.L_N1_M4_TL1_END:
+    KERNEL8x4x1_END
+
+.L_N1_M4_L7:
+    /* if (!(L & 7)) goto L_N1_M4_L0 */
+    andi TL, L, 7
+    beq TL, ZERO,.L_N1_M4_L0
+
+.L_N1_M4_L71:
+    vld U0, A0, 0x00
+    vld U1, A0, 0x10
+
+    vldrepl.d U4, B0, 0x00
+    vfmadd.d D0, U0, U4, D0
+    vfmadd.d D1, U1, U4, D1
+
+    /* Add stride for A0, B0 */
+    addi.d A0, A0, 0x20
+    addi.d B0, B0, 0x08
+
+    addi.d TL, TL, -1
+    blt ZERO,TL, .L_N1_M4_L71
+
+.L_N1_M4_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d D0, D0, VALPHA
+    vfmul.d D1, D1, VALPHA
+#else
+    /* Load C0 */
+    vld U0, C0, 0x00
+    vld U1, C0, 0x10
+    vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */
+    vfmadd.d D1, D1, VALPHA, U1
+#endif // #if defined(TRMMKERNEL)
+
+    /* Store C0 */
+    vst D0, C0, 0x00
+    vst D1, C0, 0x10
+
+    /* Add stride for C */
+    addi.d C0, C0, 0x20
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d L, K, OFF
+#ifdef LEFT
+    addi.d L, L, -4
+#else
+    addi.d L, L, -1
+#endif
+    slli.d T0, L, 0x05
+    add.d A0, A0, T0
+    slli.d T0, L, 0x03
+    add.d B0, B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d OFF, OFF, 0x04
+#endif
+#endif // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N & 1) && (M & 4) ) End************/
+
+.L_N1_M2:
+    andi I, M, 2
+    beq ZERO,I, .L_N1_M1
+
+#if defined(TRMMKERNEL)
+#if
(defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 + + vld U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M2_TL1_END + +.L_N1_M2_TL1: /* TL-- */ + KERNEL8x2x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 +.L_N1_M2_TL1_END: + KERNEL8x2x1_END + +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 + +.L_N1_M2_L71: + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 + +.L_N1_M2_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C0, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 2) ) End************/ + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M1_TL1_END + +.L_N1_M1_TL1: /* TL-- */ + KERNEL8x1x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_TL1_END: + KERNEL8x1x1_END + +.L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 + +.L_N1_M1_L71: + vldrepl.d U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* 
Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 + +.L_N1_M1_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vldrepl.d U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 1) ) End************/ + +.L_N1_M0: + +/************************* Condition 3 if((N & 1) && (M >> 3)) End !!! ************************* +* dgemm_core_16x1 */ + +.L_N0: + /* Restore regs */ + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LD $f24, $sp, 40 + LD $f25, $sp, 48 + LD $f26, $sp, 56 + LD $f27, $sp, 64 + LD $f28, $sp, 72 + LD $f29, $sp, 80 + LD $f30, $sp, 88 + LD $f31, $sp, 96 + addi.d $sp, $sp, 112 + + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_4_lsx.S b/kernel/loongarch64/dgemm_ncopy_4_lsx.S new file mode 100644 index 000000000..048a49af6 --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_4_lsx.S @@ -0,0 +1,185 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r21 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + PROLOGUE + + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x02 + beq J, ZERO, .L_N2 +.L_J1: /* J-- */ + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + addi.d J, J, -1 + beq I, ZERO, .L_I3 +.L_I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + addi.d TD, TD, 0x40 + + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_I0 +.L_II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_II1 +.L_I0: + blt ZERO, J, .L_J1 +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_2I3 +.L_2I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00 + GINTERLACE v, d, D0, D1, U1, U0 + GST v, , D0, TD, 0x00, D1, TD, 0x10 + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 +.L_2I3: + andi I, M, 0x01 + beq ZERO, I, .L_N1 +.L_2II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fst.d F0, TD, 0x00 + addi.d I, I, -1 + fst.d F1, TD, 0x08 + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + blt ZERO, I, .L_2II1 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 +.L_N0: + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_8_lsx.S b/kernel/loongarch64/dgemm_ncopy_8_lsx.S new file mode 100644 index 000000000..30bebe8df --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_8_lsx.S @@ -0,0 +1,283 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
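dgemm_ncopy_4_lsx.S above is the 4-wide column packer: for each group of four source columns, the GLD/GINTERLACE/GST helpers (macros from loongarch64_asm.S, built on vld/vilvl.d/vilvh.d/vst) emit the four column entries of every row contiguously, and the N & 2 / N & 1 tails follow the same row-by-row rule. A plain-C model of the output ordering, assuming column-major input with leading dimension lda (a sketch for orientation, not the shipped code):

```c
/* Output ordering of the 4-wide ncopy above: b receives, for each
 * 4-column group and each row i, the four entries a[i][j..j+3]. */
static void dgemm_ncopy_4_ref(long m, long n, const double *a,
                              long lda, double *b)
{
    long j = 0;
    for (; j + 4 <= n; j += 4)             /* N >> 2 groups */
        for (long i = 0; i < m; i++)
            for (long jj = 0; jj < 4; jj++)
                *b++ = a[i + (j + jj) * lda];
    if (n & 2) {                           /* two-column tail */
        for (long i = 0; i < m; i++) {
            *b++ = a[i + j * lda];
            *b++ = a[i + (j + 1) * lda];
        }
        j += 2;
    }
    if (n & 1)                             /* last column */
        for (long i = 0; i < m; i++)
            *b++ = a[i + j * lda];
}
```

The 8-wide variant whose header starts here does the same with eight column pointers (S1..S8), and the dgemm_tcopy_* files further below are the row-block counterparts: they copy contiguous row segments directly and steer the column tails to separate sub-panel pointers (P2/P3 in the 4-wide file, P2/P3/P4 in the 8-wide one).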
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r21 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + PROLOGUE + push_if_used 26, 32 + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x03 + beq J, ZERO, .L_N4 +.L_J1: + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x03 + add.d S3, S2, TL + addi.d J, J, -1 + add.d S4, S3, TL + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d TS, S7, T0 + beq I, ZERO, .L_I7 +.L_I1: + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00, \ + U4, S5, 0x00, U5, S6, 0x00, U6, S7, 0x00, U7, S8, 0x00 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10, \ + U4, S5, 0x10, U5, S6, 0x10, U6, S7, 0x10, U7, S8, 0x10 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, 
D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x20, U1, S2, 0x20, U2, S3, 0x20, U3, S4, 0x20, \ + U4, S5, 0x20, U5, S6, 0x20, U6, S7, 0x20, U7, S8, 0x20 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x30, U1, S2, 0x30, U2, S3, 0x30, U3, S4, 0x30, \ + U4, S5, 0x30, U5, S6, 0x30, U6, S7, 0x30, U7, S8, 0x30 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_I7: + andi I, M, 0x07 + beq I, ZERO, .L_I0 +.L_II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + fst.d F4, TD, 0x20 + addi.d S5, S5, 0x08 + fst.d F5, TD, 0x28 + addi.d S6, S6, 0x08 + fst.d F6, TD, 0x30 + addi.d S7, S7, 0x08 + fst.d F7, TD, 0x38 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_II1 +.L_I0: + blt ZERO, J, .L_J1 +.L_N4: + andi J, N, 0x04 + beq ZERO, J, .L_N2 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + beq I, ZERO, .L_I3 +.L_4I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + addi.d TD, TD, 0x40 + + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_4I1 +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_N2 +.L_4II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_4II1 +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_NI1 +.L_2I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00 + GINTERLACE v, d, D0, D1, U1, U0 + GST v, , D0, TD, 0x00, D1, TD, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 +.L_NI1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 
+ addi.d TD, TD, 0x10 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_4_lsx.S b/kernel/loongarch64/dgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..134066471 --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_4_lsx.S @@ -0,0 +1,280 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+/* Function parameters */
+#define M $r4 // param 1: m
+#define N $r5 // param 2: n
+#define SRC $r6 // param 3: src
+#define LDA $r7 // param 4: lda
+#define DST $r8 // param 5: dst
+
+#define I $r9
+#define J $r10
+#define S0 $r11
+#define S1 $r12
+#define S2 $r13
+#define S3 $r14
+#define S4 $r15
+#define P0 $r16
+#define P1 $r17
+#define P2 $r18
+#define P3 $r19
+#define T0 $r20
+#define T1 $r23
+#define TL $r7
+#define ZERO $r0
+
+#define F0 $f0
+#define F1 $f1
+#define F2 $f2
+#define F3 $f3
+/* LSX vectors */
+#define U0 $vr0
+#define U1 $vr1
+#define U2 $vr2
+#define U3 $vr3
+#define U4 $vr4
+#define U5 $vr5
+#define U6 $vr6
+#define U7 $vr7
+
+    PROLOGUE
+    push_if_used 18, 8
+
+    move S0, SRC
+    move P0, DST
+
+    // Compute the sub-panel pointers P2 and P3
+    srai.d T0, N, 0x02
+    slli.d T0, T0, 0x02
+    srai.d T1, N, 0x01
+    slli.d T1, T1, 0x01
+    mul.d T0, M, T0
+    mul.d T1, M, T1
+    slli.d T0, T0, 0x03
+    slli.d T1, T1, 0x03
+    add.d P2, DST, T0
+    add.d P3, DST, T1
+
+    slli.d TL, LDA, 0x03
+    srai.d J, M, 0x02
+    slli.d T0, TL, 0x01
+    slli.d T1, M, 0x05
+    beq ZERO, J, .L_M3
+.L_J1: /* J-- */
+    move S1, S0
+    add.d S2, S0, TL
+    add.d S3, S1, T0
+    add.d S4, S2, T0
+    add.d S0, S3, T0
+
+    move P1, P0
+    addi.d P0, P0, 0x80
+
+    srai.d I, N, 0x02
+    addi.d J, J, -1
+    beq ZERO, I, .L_N3
+.L_I1: /* I-- */
+    vld U0, S1, 0x00
+    vld U1, S1, 0x10
+    vld U2, S2, 0x00
+    vld U3, S2, 0x10
+    vld U4, S3, 0x00
+    vld U5, S3, 0x10
+    vld U6, S4, 0x00
+    vld U7, S4, 0x10
+
+    vst U0, P1, 0x00
+    vst U1, P1, 0x10
+    vst U2, P1, 0x20
+    vst U3, P1, 0x30
+    vst U4, P1, 0x40
+    vst U5, P1, 0x50
+    vst U6, P1, 0x60
+    vst U7, P1, 0x70
+
+    addi.d S1, S1, 0x20
+    addi.d S2, S2, 0x20
+    addi.d S3, S3, 0x20
+    addi.d S4, S4, 0x20
+    add.d P1, P1, T1
+
+    addi.d I, I, -1
+    blt ZERO, I, .L_I1
+.L_N3:
+    andi I, N, 0x02
+    beq ZERO, I, .L_N1
+
+    vld U0, S1, 0x00
+    vld U1, S2, 0x00
+    vld U2, S3, 0x00
+    vld U3, S4, 0x00
+
+    vst U0, P2, 0x00
+    vst U1, P2, 0x10
+    vst U2, P2, 0x20
+    vst U3, P2, 0x30
+
+    addi.d S1, S1, 0x10
+    addi.d S2, S2, 0x10
+    addi.d S3, S3, 0x10
+    addi.d S4, S4, 0x10
+    addi.d P2, P2, 0x40
+.L_N1:
+    andi I, N, 0x01
+    beq ZERO, I, .L_N0
+
+    fld.d F0, S1, 0x00
+    fld.d F1, S2, 0x00
+    fld.d F2, S3, 0x00
+    fld.d F3, S4, 0x00
+
+    fst.d F0, P3, 0x00
+    fst.d F1, P3, 0x08
+    fst.d F2, P3, 0x10
+    fst.d F3, P3, 0x18
+
+    addi.d S1, S1, 0x08
+    addi.d S2, S2, 0x08
+    addi.d S3, S3, 0x08
+    addi.d S4, S4, 0x08
+    addi.d P3, P3, 0x20
+
+.L_N0:
+    blt ZERO, J, .L_J1
+
+.L_M3:
+    andi J, M, 0x02
+    beq ZERO, J, .L_M1
+
+    move S1, S0
+    add.d S2, S0, TL
+    add.d S0, S0, T0
+
+    move P1, P0
+    addi.d P0, P0, 0x40
+
+    srai.d I, N, 0x02
+    beq ZERO, I, .L_2N3
+
+.L_2I1: /* I-- */
+    vld U0, S1, 0x00
+    vld U1, S1, 0x10
+    vld U2, S2, 0x00
+    vld U3, S2, 0x10
+
+    vst U0, P1, 0x00
+    vst U1, P1, 0x10
+    vst U2, P1, 0x20
+    vst U3, P1, 0x30
+
+    addi.d S1, S1, 0x20
+    addi.d S2, S2, 0x20
+    addi.d I, I, -1
+    add.d P1, P1, T1
+
+    blt ZERO, I, .L_2I1
+
+.L_2N3:
+    andi I, N, 0x02
+    beq ZERO, I, .L_2N1
+
+    vld U0, S1, 0x00
+    vld U1, S2, 0x00
+
+    vst U0, P2, 0x00
+    vst U1, P2, 0x10
+
+    addi.d S1, S1, 0x10
+    addi.d S2, S2, 0x10
+    addi.d P2, P2, 0x20
+
+.L_2N1:
+    andi I, N, 0x01
+    beq ZERO, I, .L_M1
+
+    fld.d F0, S1, 0x00
+    fld.d F1, S2, 0x00
+
+    fst.d F0, P3, 0x00
+    fst.d F1, P3, 0x08
+
+    addi.d S1, S1, 0x08
+    addi.d S2, S2, 0x08
+    addi.d P3, P3, 0x10
+.L_M1:
+    andi J, M, 0x01
+    beq ZERO, J, .L_M0
+
+    move S1, S0
+    move P1, P0
+
+    srai.d I, N, 0x02
+    beq
ZERO, I, .L_1N3 + +.L_1I1: + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + + addi.d S1, S1, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_1I1 + +.L_1N3: + andi I, N, 0x02 + beq I, ZERO, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P2, 0x00 + fst.d F1, P2, 0x08 + + addi.d S1, S1, 0x10 + addi.d P2, P2, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq I, ZERO, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P3, 0x00 + +.L_M0: + pop_if_used 18, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_8_lsx.S b/kernel/loongarch64/dgemm_tcopy_8_lsx.S new file mode 100644 index 000000000..a7e3ef69c --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_8_lsx.S @@ -0,0 +1,597 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 + + PROLOGUE + push_if_used 24, 8 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x03 + srai.d T1, N, 0x02 + slli.d T0, T0, 0x03 + slli.d T1, T1, 0x02 + mul.d P2, M, T0 + mul.d P3, M, T1 + slli.d P2, P2, 0x03 + slli.d P3, P3, 0x03 + add.d P2, DST, P2 + add.d P3, DST, P3 + + srai.d T0, N, 0x01 + slli.d T0, T0, 0x01 + mul.d P4, M, T0 + slli.d P4, P4, 0x03 + add.d P4, DST, P4 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x03 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x06 + beq ZERO, J, .L_M7 +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S0, S7, T0 + + move P1, P0 + addi.d P0, P0, 0x200 + + srai.d I, N, 0x03 + addi.d J, J, -1 + beq ZERO, I, .L_N7 + +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + vld U0, S3, 0x00 + vld U1, S3, 0x10 + vld U2, S3, 0x20 + vld U3, S3, 0x30 + vld U4, S4, 0x00 + vld U5, S4, 0x10 + vld U6, S4, 0x20 + vld U7, S4, 0x30 + + vst U0, P1, 0x80 + vst U1, P1, 0x90 + vst U2, P1, 0xa0 + vst U3, P1, 0xb0 + vst U4, P1, 0xc0 + vst U5, P1, 0xd0 + vst U6, P1, 0xe0 + vst U7, P1, 0xf0 + + vld U0, S5, 0x00 + vld U1, S5, 0x10 + vld U2, S5, 0x20 + vld U3, S5, 0x30 + vld U4, S6, 0x00 + vld U5, S6, 0x10 + vld U6, S6, 0x20 + vld U7, S6, 0x30 + + vst U0, P1, 0x100 + vst U1, P1, 0x110 + vst U2, P1, 0x120 + vst U3, P1, 0x130 + vst U4, P1, 0x140 + vst U5, P1, 0x150 + vst U6, P1, 0x160 + vst U7, P1, 0x170 + + vld U0, S7, 0x00 + vld U1, S7, 0x10 + vld U2, S7, 0x20 + vld U3, S7, 0x30 + vld U4, S8, 0x00 + vld U5, S8, 0x10 + vld U6, S8, 0x20 + vld U7, S8, 0x30 + + vst U0, P1, 0x180 + vst U1, P1, 0x190 + vst U2, P1, 0x1a0 + vst U3, P1, 0x1b0 + vst U4, P1, 0x1c0 + vst U5, P1, 0x1d0 + vst U6, P1, 0x1e0 + vst U7, P1, 0x1f0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_I1 +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + vst U4, P2, 0x40 + 
vst U5, P2, 0x50 + vst U6, P2, 0x60 + vst U7, P2, 0x70 + + vld U0, S5, 0x00 + vld U1, S5, 0x10 + vld U2, S6, 0x00 + vld U3, S6, 0x10 + vld U4, S7, 0x00 + vld U5, S7, 0x10 + vld U6, S8, 0x00 + vld U7, S8, 0x10 + + vst U0, P2, 0x80 + vst U1, P2, 0x90 + vst U2, P2, 0xa0 + vst U3, P2, 0xb0 + vst U4, P2, 0xc0 + vst U5, P2, 0xd0 + vst U6, P2, 0xe0 + vst U7, P2, 0xf0 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S5, S5, 0x20 + addi.d S6, S6, 0x20 + addi.d S7, S7, 0x20 + addi.d S8, S8, 0x20 + addi.d P2, P2, 0x100 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + vld U4, S5, 0x00 + vld U5, S6, 0x00 + vld U6, S7, 0x00 + vld U7, S8, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + vst U2, P3, 0x20 + vst U3, P3, 0x30 + vst U4, P3, 0x40 + vst U5, P3, 0x50 + vst U6, P3, 0x60 + vst U7, P3, 0x70 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d P3, P3, 0x80 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + fst.d F4, P4, 0x20 + fst.d F5, P4, 0x28 + + fst.d F6, P4, 0x30 + fst.d F7, P4, 0x38 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d P4, P4, 0x40 + +.L_N0: + blt ZERO, J, .L_J1 +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 + + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x100 + + srai.d I, N, 0x03 + beq ZERO, I, .L_4N7 +.L_4I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + vld U0, S3, 0x00 + vld U1, S3, 0x10 + vld U2, S3, 0x20 + vld U3, S3, 0x30 + vld U4, S4, 0x00 + vld U5, S4, 0x10 + vld U6, S4, 0x20 + vld U7, S4, 0x30 + + vst U0, P1, 0x80 + vst U1, P1, 0x90 + vst U2, P1, 0xa0 + vst U3, P1, 0xb0 + vst U4, P1, 0xc0 + vst U5, P1, 0xd0 + vst U6, P1, 0xe0 + vst U7, P1, 0xf0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_4I1 +.L_4N7: + andi I, N, 0x04 + beq ZERO, I, .L_4N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + vst U4, P2, 0x40 + vst U5, P2, 0x50 + vst U6, P2, 0x60 + vst U7, P2, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d P2, P2, 0x80 + +.L_4N3: + andi I, N, 0x02 + beq ZERO, I, .L_4N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + vst U2, P3, 0x20 + vst U3, P3, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P3, P3, 0x40 + +.L_4N1: + andi 
I, N, 0x01 + beq ZERO, I, .L_M3 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P4, P4, 0x20 +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x03 + beq ZERO, I, .L_2N7 +.L_2I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_2I1 +.L_2N7: + andi I, N, 0x04 + beq ZERO, I, .L_2N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d P2, P2, 0x40 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P3, P3, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P4, P4, 0x10 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + add.d S2, S0, TL + + move P1, P0 + addi.d P0, P0, 0x40 + + srai.d I, N, 0x03 + beq ZERO, I, .L_1N7 +.L_1I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + + addi.d S1, S1, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_1I1 + +.L_1N7: + andi I, N, 0x04 + beq ZERO, I, .L_1N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + + addi.d S1, S1, 0x20 + addi.d P2, P2, 0x20 + +.L_1N3: + andi I, N, 0x02 + beq ZERO, I, .L_1N1 + + vld U0, S1, 0x00 + vst U0, P3, 0x00 + + addi.d S1, S1, 0x10 + addi.d P3, P3, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P4, 0x00 + + addi.d S1, S1, 0x08 + addi.d P4, P4, 0x08 +.L_M0: + pop_if_used 24, 8 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dnrm2_lasx.S b/kernel/loongarch64/dnrm2_lasx.S new file mode 100644 index 000000000..5a6f7cf1e --- /dev/null +++ b/kernel/loongarch64/dnrm2_lasx.S @@ -0,0 +1,257 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. 
Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 + +/* Don't change following FR unless you know the effects. */ +#define VX0 $xr15 +#define VX1 $xr16 +#define VM0 $xr17 +#define VM1 $xr18 +#define VM2 $xr13 +#define VM3 $xr14 +#define res1 $xr19 +#define res2 $xr20 +#define VALPHA $xr21 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L97 + .align 3 + +.L10: + xvld VX0, X, 0 + xvld VX1, X, 4 * SIZE + xvfmaxa.d VM1, VX1, VX0 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t3, 2 + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t2, 1 + ld.d t3, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t3, 2 + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + xvpickve.d VX0, VM0, 1 + xvpickve.d VX1, VM0, 2 + xvpickve.d VM3, VM0, 3 + fmaxa.d $f17, $f17, $f14 + fmaxa.d $f17, $f17, $f15 + fmaxa.d $f17, $f17, $f16 + .align 3 + +.L97: + andi I, N, 7 + bge $r0, I, .L99 + .align 3 + +.L98: + xvld VX1, X, 0 + xvfmaxa.d VM0, VM0, VX1 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, SIZE + bne INCX, TEMP, .L120 + srai.d I, N, 3 + bge 
$r0, I, .L997 + .align 3 + +.L110: + xvld VX0, XX, 0 + xvld VX1, XX, 4 * SIZE + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + srai.d I, N, 3 + bge $r0, I, .L997 + +.L121: + ld.d t1, XX, 0 + add.d XX, XX, INCX + ld.d t2, XX, 0 + add.d XX, XX, INCX + ld.d t3, XX, 0 + add.d XX, XX, INCX + ld.d t4, XX, 0 + add.d XX, XX, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, XX, 0 + add.d XX, XX, INCX + ld.d t2, XX, 0 + add.d XX, XX, INCX + ld.d t3, XX, 0 + add.d XX, XX, INCX + ld.d t4, XX, 0 + add.d XX, XX, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.d VX0, res1, 1 + xvpickve.d VX1, res1, 2 + xvpickve.d VM2, res1, 3 + fadd.d $f19, $f19, $f15 + fadd.d $f19, $f19, $f16 + fadd.d $f19, $f19, $f13 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d $f15, XX, 0 + addi.d I, I, -1 + fmul.d $f15, $f15, ALPHA + fmadd.d $f19, $f15, $f15, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/dnrm2_lsx.S b/kernel/loongarch64/dnrm2_lsx.S new file mode 100644 index 000000000..fce4260e2 --- /dev/null +++ b/kernel/loongarch64/dnrm2_lsx.S @@ -0,0 +1,268 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 + +/* Don't change following FR unless you know the effects. */ +#define VX0 $vr15 +#define VX1 $vr16 +#define VM0 $vr17 +#define VM1 $vr18 +#define VM2 $vr13 +#define VM3 $vr14 +#define res1 $vr19 +#define res2 $vr20 +#define VALPHA $vr21 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + + bge $r0, I, .L97 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmaxa.d VM1, VX1, VX0 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmaxa.d VM2, VX1, VX0 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + vfmaxa.d VM0, VX0, VX1 + .align 3 + +.L97: + andi I, N, 7 + bge $r0, I, .L99 + .align 3 + +.L98: + vld VX1, X, 0 + vfmaxa.d VM0, VM0, VX1 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, SIZE + bne INCX, TEMP, .L120 + srai.d I, N, 3 + bge $r0, I, .L997 + .align 3 + +.L110: + vld VX0, XX, 0 * SIZE + vld VX1, XX, 2 * SIZE + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + vld VX0, XX, 4 * SIZE + vld VX1, XX, 6 * SIZE + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + srai.d I, N, 3 + bge $r0, I, .L997 + .align 3 + +.L121: + ld.d t1, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t2, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t4, XX, 0 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + ld.d t1, XX, 0 * SIZE + 
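+    // the remaining gathers below are interleaved with the scale (vfmul) and square-accumulate (vfmadd) of the previous pair to hide load latency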
add.d XX, XX, INCX + vfmul.d VM3, VX1, VALPHA + ld.d t2, XX, 0 * SIZE + add.d XX, XX, INCX + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + ld.d t3, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t4, XX, 0 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d $f15, XX, 0 * SIZE + addi.d I, I, -1 + fmul.d $f15, $f15, ALPHA + fmadd.d $f19, $f15, $f15, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dot_lasx.S b/kernel/loongarch64/dot_lasx.S new file mode 100644 index 000000000..0715b6311 --- /dev/null +++ b/kernel/loongarch64/dot_lasx.S @@ -0,0 +1,368 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 + +#define I $r17 +#define TEMP $r18 + +/* Don't change following FR unless you know the effects. 
*/
+#define s1 $f8
+#define s2 $f9
+#define a1 $f10
+#define b1 $f11
+
+PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT N, 0(N)
+    LDINT INCX, 0(INCX)
+    LDINT INCY, 0(INCY)
+#endif
+
+    /* init $f8 and $f9 to zero */
+    SUB s1, s1, s1
+    SUB s2, s2, s2
+    slli.d INCX, INCX, BASE_SHIFT
+    li.d TEMP, SIZE
+    slli.d INCY, INCY, BASE_SHIFT
+    bge $r0, N, .L999
+    bne INCX, TEMP, .L20 /* skip to .L20 if inc_x != 1 */
+    bne INCY, TEMP, .L20 /* skip to .L20 if inc_y != 1 */
+
+    /* (inc_x == 1) && (inc_y == 1) */
+
+    /* init $xr8 and $xr9 to zero */
+#ifdef DOUBLE
+    xvldrepl.d $xr0, X, 0
+#else
+    xvldrepl.w $xr0, X, 0
+#endif
+#ifdef DSDOT
+    xvfcvtl.d.s $xr0, $xr0
+    xvfsub.d $xr8, $xr0, $xr0
+    xvfsub.d $xr9, $xr0, $xr0
+#else
+    XVFSUB $xr8, $xr0, $xr0
+    XVFSUB $xr9, $xr0, $xr0
+#endif
+
+#ifdef DOUBLE
+    srai.d I, N, 4
+#else
+    srai.d I, N, 5
+#endif
+    bge $r0, I, .L12 /* FLOAT: <32 ; DOUBLE: <16 */
+    .align 3
+.L11:
+    /* FLOAT: 32~ ; DOUBLE: 16~ */
+    xvld $xr0, X, 0
+    xvld $xr1, X, 32
+    xvld $xr2, X, 64
+    xvld $xr3, X, 96
+    xvld $xr4, Y, 0
+    xvld $xr5, Y, 32
+    xvld $xr6, Y, 64
+    xvld $xr7, Y, 96
+    addi.w I, I, -1
+    addi.d X, X, 128
+    addi.d Y, Y, 128
+#ifdef DSDOT
+    /* DSDOT: widen lo/hi singles to doubles, accumulate x_lo*y_lo in $xr8 and x_hi*y_hi in $xr9 */
+    xvfcvtl.d.s $xr10, $xr0
+    xvfcvtl.d.s $xr11, $xr4
+    xvfcvth.d.s $xr12, $xr0
+    xvfcvth.d.s $xr13, $xr4
+    xvfmadd.d $xr8, $xr10, $xr11, $xr8
+    xvfmadd.d $xr9, $xr12, $xr13, $xr9
+    xvfcvtl.d.s $xr10, $xr1
+    xvfcvtl.d.s $xr11, $xr5
+    xvfcvth.d.s $xr12, $xr1
+    xvfcvth.d.s $xr13, $xr5
+    xvfmadd.d $xr8, $xr10, $xr11, $xr8
+    xvfmadd.d $xr9, $xr12, $xr13, $xr9
+    xvfcvtl.d.s $xr10, $xr2
+    xvfcvtl.d.s $xr11, $xr6
+    xvfcvth.d.s $xr12, $xr2
+    xvfcvth.d.s $xr13, $xr6
+    xvfmadd.d $xr8, $xr10, $xr11, $xr8
+    xvfmadd.d $xr9, $xr12, $xr13, $xr9
+    xvfcvtl.d.s $xr10, $xr3
+    xvfcvtl.d.s $xr11, $xr7
+    xvfcvth.d.s $xr12, $xr3
+    xvfcvth.d.s $xr13, $xr7
+    xvfmadd.d $xr8, $xr10, $xr11, $xr8
+    xvfmadd.d $xr9, $xr12, $xr13, $xr9
+#else
+    XVFMADD $xr8, $xr0, $xr4, $xr8
+    XVFMADD $xr9, $xr1, $xr5, $xr9
+    XVFMADD $xr8, $xr2, $xr6, $xr8
+    XVFMADD $xr9, $xr3, $xr7, $xr9
+#endif
+    bnez I, .L11
+    .align 3
+.L12:
+#ifdef DOUBLE
+    andi I, N, 0xf
+    srai.d I, I, 2
+#else
+    andi I, N, 0x1f
+    srai.d I, I, 3
+#endif
+    bge $r0, I, .L14 /* DOUBLE: <4 ; FLOAT: <8 */
+    .align 3
+.L13:
+    /* FLOAT: 8~31 ; DOUBLE: 4~15 */
+    xvld $xr0, X, 0
+    xvld $xr4, Y, 0
+    addi.w I, I, -1
+    addi.d X, X, 32
+    addi.d Y, Y, 32
+#ifdef DSDOT
+    xvfcvtl.d.s $xr10, $xr0
+    xvfcvtl.d.s $xr11, $xr4
+    xvfcvth.d.s $xr12, $xr0
+    xvfcvth.d.s $xr13, $xr4
+    xvfmadd.d $xr8, $xr10, $xr11, $xr8
+    xvfmadd.d $xr9, $xr12, $xr13, $xr9
+#else
+    XVFMADD $xr8, $xr0, $xr4, $xr8
+#endif
+    bnez I, .L13
+    .align 3
+.L14:
+    /* store dot in s1 $f8 */
+#ifdef DSDOT
+    xvfadd.d $xr8, $xr8, $xr9
+    fsub.s s2, s2, s2 /* set s2 to 0.0 */
+    xvpermi.q $xr0, $xr8, 0x1
+    vfadd.d $vr8, $vr8, $vr0
+    vpackod.d $vr0, $vr8, $vr8
+    vfadd.d $vr8, $vr8, $vr0
+#else
+    XVFADD $xr8, $xr8, $xr9
+    SUB s2, s2, s2 /* set s2 to 0.0 */
+    xvpermi.q $xr0, $xr8, 0x1
+    VFADD $vr8, $vr8, $vr0
+    vpackod.d $vr0, $vr8, $vr8
+#ifdef DOUBLE
+    VFADD $vr8, $vr8, $vr0
+#else
+    VFADD $vr8, $vr8, $vr0
+    vpackod.w $vr0, $vr8, $vr8
+    VFADD $vr8, $vr8, $vr0
+#endif /* defined DOUBLE */
+#endif /* defined DSDOT */
+    .align 3
+.L15:
+#ifdef DOUBLE
+    andi I, N, 0x3
+#else
+    andi I, N, 0x7
+#endif
+    bge $r0, I, .L999 /* =0 */
+    .align 3
+.L16:
+    /* FLOAT: 1~7 ; DOUBLE: 1~3 */
+    LD a1, X, 0
+    LD b1, Y, 0
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+    addi.d I, I, -1
+    addi.d X, X, SIZE
+    addi.d Y, Y, SIZE
+    bnez I, .L16
+    b .L999
+    .align 3
+
+.L20:
+/* !((inc_x == 1) && (inc_y == 1)) */
+    srai.d I, N, 3
+#ifdef F_INTERFACE
+    bgez INCX, .L21
+    addi.d TEMP, N, -1
+    mul.d TEMP, TEMP, INCX /* LoongArch64 has no mult/mflo/dsub; mul.d keeps the low 64 bits directly */
+    sub.d X, X, TEMP
+    .align 3
+
+.L21:
+    bgez INCY, .L22
+    addi.d TEMP, N, -1
+    mul.d TEMP, TEMP, INCY
+    sub.d Y, Y, TEMP
+    .align 3
+
+.L22:
+#endif
+    bge $r0, I, .L25 /* <8 */
+    .align 3
+
+.L23:
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s2, b1, a1, s2
+#else
+    MADD s2, b1, a1, s2
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s2, b1, a1, s2
+#else
+    MADD s2, b1, a1, s2
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s2, b1, a1, s2
+#else
+    MADD s2, b1, a1, s2
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    addi.d I, I, -1
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s2, b1, a1, s2
+#else
+    MADD s2, b1, a1, s2
+#endif
+    blt $r0, I, .L23
+    .align 3
+
+.L25:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L26:
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    addi.d I, I, -1
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+    blt $r0, I, .L26
+    .align 3
+
+.L999:
+#ifdef DSDOT
+    fadd.d $f0, s1, s2
+#else
+    ADD $f0, s1, s2
+#endif
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+
+EPILOGUE
diff --git a/kernel/loongarch64/dot_lsx.S b/kernel/loongarch64/dot_lsx.S
new file mode 100644
index 000000000..f7f613553
--- /dev/null
+++ b/kernel/loongarch64/dot_lsx.S
@@ -0,0 +1,364 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define Y $r7
+#define INCY $r8
+
+#define I $r17
+#define TEMP $r18
+
+/* Don't change following FR unless you know the effects. */
+#define s1 $f8
+#define s2 $f9
+#define a1 $f10
+#define b1 $f11
+
+PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT N, 0(N)
+    LDINT INCX, 0(INCX)
+    LDINT INCY, 0(INCY)
+#endif
+
+    /* init $f8 and $f9 to zero */
+    SUB s1, s1, s1
+    SUB s2, s2, s2
+    slli.d INCX, INCX, BASE_SHIFT
+    li.d TEMP, SIZE
+    slli.d INCY, INCY, BASE_SHIFT
+    bge $r0, N, .L999
+    bne INCX, TEMP, .L20 /* skip to .L20 if inc_x != 1 */
+    bne INCY, TEMP, .L20 /* skip to .L20 if inc_y != 1 */
+
+    /* (inc_x == 1) && (inc_y == 1) */
+
+    /* init $vr8 and $vr9 to zero */
+#ifdef DOUBLE
+    vldrepl.d $vr0, X, 0
+#else
+    vldrepl.w $vr0, X, 0
+#endif
+#ifdef DSDOT
+    vfcvtl.d.s $vr0, $vr0
+    vfsub.d $vr8, $vr0, $vr0
+    vfsub.d $vr9, $vr0, $vr0
+#else
+    VFSUB $vr8, $vr0, $vr0
+    VFSUB $vr9, $vr0, $vr0
+#endif
+
+#ifdef DOUBLE
+    srai.d I, N, 3
+#else
+    srai.d I, N, 4
+#endif
+    bge $r0, I, .L12 /* FLOAT: <16 ; DOUBLE: <8 */
+    .align 3
+.L11:
+    /* FLOAT: 16~ ; DOUBLE: 8~ */
+    vld $vr0, X, 0
+    vld $vr1, X, 16
+    vld $vr2, X, 32
+    vld $vr3, X, 48
+    vld $vr4, Y, 0
+    vld $vr5, Y, 16
+    vld $vr6, Y, 32
+    vld $vr7, Y, 48
+    addi.w I, I, -1
+    addi.d X, X, 64
+    addi.d Y, Y, 64
+#ifdef DSDOT
+    /* DSDOT: widen lo/hi singles to doubles, accumulate x_lo*y_lo in $vr8 and x_hi*y_hi in $vr9 */
+    vfcvtl.d.s $vr10, $vr0
+    vfcvtl.d.s $vr11, $vr4
+    vfcvth.d.s $vr12, $vr0
+    vfcvth.d.s $vr13, $vr4
+    vfmadd.d $vr8, $vr10, $vr11, $vr8
+    vfmadd.d $vr9, $vr12, $vr13, $vr9
+    vfcvtl.d.s $vr10, $vr1
+    vfcvtl.d.s $vr11, $vr5
+    vfcvth.d.s $vr12, $vr1
+    vfcvth.d.s $vr13, $vr5
+    vfmadd.d $vr8, $vr10, $vr11, $vr8
+    vfmadd.d $vr9, $vr12, $vr13, $vr9
+    vfcvtl.d.s $vr10, $vr2
+    vfcvtl.d.s $vr11, $vr6
+    vfcvth.d.s $vr12, $vr2
+    vfcvth.d.s $vr13, $vr6
+    vfmadd.d $vr8, $vr10, $vr11, $vr8
+    vfmadd.d $vr9, $vr12, $vr13, $vr9
+    vfcvtl.d.s $vr10, $vr3
+    vfcvtl.d.s $vr11, $vr7
+    vfcvth.d.s $vr12, $vr3
+    vfcvth.d.s $vr13, $vr7
+    vfmadd.d $vr8, $vr10, $vr11, $vr8
+    vfmadd.d $vr9, $vr12, $vr13, $vr9
+#else
+    VFMADD $vr8, $vr0, $vr4, $vr8
+    VFMADD $vr9, $vr1, $vr5, $vr9
+    VFMADD $vr8, $vr2, $vr6, $vr8
+    VFMADD $vr9, $vr3, $vr7, $vr9
+#endif
+    bnez I, .L11
+    .align 3
+.L12:
+#ifdef DOUBLE
+    andi I, N, 0x7
+    srai.d I, I, 1
+#else
+    andi I, N, 0xf
+    srai.d I, I, 2
+#endif
+    bge $r0, I, .L14 /* DOUBLE: <2 ; FLOAT: <4 */
+    .align 3
+.L13:
+    /* FLOAT: 4~15 ; DOUBLE: 2~7 */
+    vld $vr0, X, 0
+    vld $vr4, Y, 0
+    addi.w I, I, -1
+    addi.d X, X, 16
+    addi.d Y, Y, 16
+#ifdef DSDOT
+    vfcvtl.d.s $vr10, $vr0
+    vfcvtl.d.s $vr11, $vr4
+    vfcvth.d.s $vr12, $vr0
+    vfcvth.d.s $vr13, $vr4
+    vfmadd.d $vr8, $vr10, $vr11, $vr8
+    vfmadd.d $vr9, $vr12, $vr13, $vr9
+#else
+    VFMADD $vr8, $vr0, $vr4, $vr8
+#endif
+    bnez I, .L13
+    .align 3
+.L14:
+    /* store dot in s1 $f8 */
+#ifdef DSDOT
+    vfadd.d $vr8, $vr8, $vr9
+    fsub.s s2, s2, s2 /* set s2 to 0.0 */
+    vpackod.d $vr0, $vr8, $vr8
+    vfadd.d $vr8, $vr8, $vr0
+#else
+    VFADD $vr8, $vr8, $vr9
+    SUB s2, s2, s2 /* set s2 to 0.0 */
+    vpackod.d $vr0, $vr8, $vr8
+#ifdef DOUBLE
+    VFADD $vr8, $vr8, $vr0
+#else
+    VFADD $vr8, $vr8, $vr0
+    vpackod.w $vr0, $vr8, $vr8
+    VFADD $vr8, $vr8, $vr0
+#endif /* defined DOUBLE */
+#endif /* defined DSDOT */
+    .align 3
+.L15:
+#ifdef DOUBLE
+    andi I, N, 0x1
+#else
+    andi I, N, 0x3
+#endif
+    bge $r0, I, .L999 /* =0 */
+    .align 3
+.L16:
+    /* DOUBLE: 1 ; FLOAT: 1~3 */
+    LD a1, X, 0
+    LD b1, Y, 0
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+    addi.d I, I, -1
+    addi.d X, X, SIZE
+    addi.d Y, Y, SIZE
+    bnez I, .L16
+    b .L999
+    .align 3
+
+.L20:
+/* !((inc_x == 1) && (inc_y == 1)) */
+    srai.d I, N, 3
+#ifdef F_INTERFACE
+    bgez INCX, .L21
+    addi.d TEMP, N, -1
+    mul.d TEMP, TEMP, INCX /* LoongArch64 has no mult/mflo/dsub; mul.d keeps the low 64 bits directly */
+    sub.d X, X, TEMP
+    .align 3
+
+.L21:
+    bgez INCY, .L22
+    addi.d TEMP, N, -1
+    mul.d TEMP, TEMP, INCY
+    sub.d Y, Y, TEMP
+    .align 3
+
+.L22:
+#endif
+    bge $r0, I, .L25 /* <8 */
+    .align 3
+
+.L23:
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s2, b1, a1, s2
+#else
+    MADD s2, b1, a1, s2
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s2, b1, a1, s2
+#else
+    MADD s2, b1, a1, s2
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s2, b1, a1, s2
+#else
+    MADD s2, b1, a1, s2
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    addi.d I, I, -1
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s2, b1, a1, s2
+#else
+    MADD s2, b1, a1, s2
+#endif
+    blt $r0, I, .L23
+    .align 3
+
+.L25:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L26:
+    LD a1, X, 0 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    addi.d I, I, -1
+#ifdef DSDOT
+    fcvt.d.s a1, a1
+    fcvt.d.s b1, b1
+    fmadd.d s1, b1, a1, s1
+#else
+    MADD s1, b1, a1, s1
+#endif
+    blt $r0, I, .L26
+    .align 3
+
+.L999:
+#ifdef DSDOT
+    fadd.d $f0, s1, s2
+#else
+    ADD $f0, s1, s2
+#endif
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+
+EPILOGUE
diff --git a/kernel/loongarch64/dscal_lasx.S b/kernel/loongarch64/dscal_lasx.S
new file mode 100644
index 000000000..153662378
--- /dev/null
+++ b/kernel/loongarch64/dscal_lasx.S
@@ -0,0 +1,194 @@
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define ALPHA $f0
+#define X $r7
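+// note: $r5 and $r6 carry unused dummy arguments of the SCAL kernel interface, so X arrives in $r7 and INCX in $r8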
+#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VT0 $xr14 +#define VT1 $xr15 +#define VALPHA $xr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + movgr2fr.d a2, TEMP + ffint.d.l a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + fcmp.ceq.d $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA, TEMP + move XX, X + .align 3 + +.L10: + bge $r0, I, .L32 + .align 3 +.L11: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX1, VALPHA + xvstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 3 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L23: + fst.d a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 +.L25: + xvxor.v VX0, VX0, VX0 + xvst VX0, X, 0 * SIZE + xvst VX0, X, 4 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L27: + fst.d a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA , TEMP + .align 3 + +.L31: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VT0, VX0, VALPHA + xvfmul.d VT1, VX1, VALPHA + addi.d I, I, -1 + xvst VT0, X, 0 * SIZE + xvst VT1, X, 4 * SIZE + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L33: + fld.d a1, X, 0 * SIZE + addi.d I, I, -1 + fmul.d a1, ALPHA, a1 + fst.d a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dscal_lsx.S b/kernel/loongarch64/dscal_lsx.S new file mode 100644 index 000000000..55f497752 --- 
/dev/null +++ b/kernel/loongarch64/dscal_lsx.S @@ -0,0 +1,205 @@ +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VT0 $vr14 +#define VT1 $vr15 +#define VALPHA $vr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + movgr2fr.d a2, TEMP + ffint.d.l a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + fcmp.ceq.d $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA, TEMP + move XX, X + .align 3 + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 + +.L11: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L23: + fst.d a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 +.L25: + vxor.v VX0, VX0, VX0 + vst VX0, X, 0 * SIZE + vst VX0, X, 2 * SIZE + vst VX0, X, 4 * SIZE + vst VX0, X, 6 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L27: + fst.d a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA , TEMP + .align 3 + +.L31: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vfmul.d VT1, VX1, VALPHA + vld VX0, X, 4 * SIZE + vst VT0, X, 0 * SIZE + vst VT1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vld VX1, X, 6 * SIZE + vst VT0, X, 4 * SIZE + vfmul.d VT1, VX1, VALPHA + vst VT1, X, 6 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 
3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L33: + fld.d a1, X, 0 * SIZE + addi.d I, I, -1 + fmul.d a1, ALPHA, a1 + fst.d a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/iamax_lasx.S b/kernel/loongarch64/iamax_lasx.S new file mode 100644 index 000000000..090da3004 --- /dev/null +++ b/kernel/loongarch64/iamax_lasx.S @@ -0,0 +1,542 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfmaxa.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmaxa.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 +#else + xvld VX0, X, 0 * SIZE + addi.d I, I, -1 + xvadd.w VI1, VI1, VINC8 + xvfmaxa.s VM1, VX0, VM0 + xvfcmp.ceq.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 +#endif + XVFMAXA VM1, x1, x2 + XVCMPEQ VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, 
VT0 + XVFMAXA VM0, x3, x4 + XVCMPEQ VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + XVFMAXA VM0, VM0, VM1 + XVCMPEQ VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + CMPEQ $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfmaxa.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmaxa.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + 
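+    // VT0 flags lanes where the running max is unchanged; the two selects below keep the old value and its index there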
xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvadd.w VI1, VI1, VINC8 + xvfmaxa.s VM1, VX0, VM0 + xvfcmp.ceq.s VT0, VM1, VM0 + addi.d I, I, -1 + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.d VM0, x4, x3 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmaxa.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfmaxa.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + CMPEQ $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 +#else + fmov.s $f16, $f20 +#endif + .align 3 + +#ifdef DOUBLE + +#else +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfmaxa.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 +.L272: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L292 + xvfcmp.clt.s 
VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + xvfmaxa.s VM0, VX0, VM0 + xvfcmp.ceq.s VT0, VM0, VX0 + xvbitsel.v VI0, VI0, VI1, VT0 + movfr2gr.s i0, $f20 +#endif + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD $f9, X, 0 + addi.d I, I, -1 + XVFMAXA VM1, x1, VM0 + XVCMPEQ VT0, VM0, VM1 + add.d X, X, INCX + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/iamax_lsx.S b/kernel/loongarch64/iamax_lsx.S new file mode 100644 index 000000000..ce5b3c724 --- /dev/null +++ b/kernel/loongarch64/iamax_lsx.S @@ -0,0 +1,482 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC2 $vr17 +#define VINC4 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L11 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC2, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfmaxa.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfmaxa.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmaxa.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + vbitsel.v x2, x4, x2, VT0 + vfmaxa.d VM1, VM0, x3 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, x2, VI0, VT0 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE +#else + vld VX0, X, 0 * SIZE + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC2 + vfmaxa.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + addi.d I, I, -1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmaxa.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L16 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L17 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC2, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC4, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC4, VINC2, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, 
VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+#endif
+    .align 3
+
+#ifdef DOUBLE
+.L16:
+    vfmaxa.d VM0, x1, x2
+    vfcmp.ceq.d VT0, x1, VM0
+    vbitsel.v VI0, VI2, VI1, VT0
+    .align 3
+
+.L17:
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L11: //INCX==1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L14
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L13:
+    fld.d $f9, X, 0
+    vfmaxa.d VM1, x1, VM0
+    vfcmp.ceq.d VT0, VM0, VM1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    addi.d I, I, -1
+    addi.d i1, i1, 1
+    addi.d X, X, SIZE
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L13
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L14:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    addi.d i0, i0, 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t2, 1
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+    .align 3
+
+.L24: // gather eight strided elements with scalar loads
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI1, VINC4
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfmaxa.d x1, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x1
+    vbitsel.v x2, VI2, VI1, VT0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI2, VINC2
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfmaxa.d x3, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x3
+    vbitsel.v x4, VI2, VI1, VT0
+    vfmaxa.d x3, x1, x3
+    vfcmp.ceq.d VT0, x1, x3
+    vbitsel.v x2, x4, x2, VT0
+    // merge this group's max into VM0 and keep the matching index in VI0
+    vfmaxa.d VM1, VM0, x3
+    vfcmp.ceq.d VT0, VM0, VM1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, x2, VI0, VT0
+    addi.d I, I, -1
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L27
+    .align 3
+
+.L26:
+    vfmaxa.d VM0, x1, x2
+    vfcmp.ceq.d VT0, x1, VM0
+    vbitsel.v VI0, VI2, VI1, VT0
+    .align 3
+
+.L27:
+    movfr2gr.d i0, $f20
+    .align 3
+
+#else
+.L20: // INCX!=1
+    move TEMP, X
+    addi.w i0, i0, 1
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC2, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC4, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+    .align 3
+
+.L24:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vadd.w VI1, VI1, VINC4
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vadd.w VI2, VI1, VINC2
+    vfmaxa.s VM1, VX0, VX1
+    vfcmp.ceq.s VT0, VX0, VM1
+    vbitsel.v VI2, VI2, VI1, VT0
+    vfmaxa.s VM1, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    addi.d I, I, -1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI2, VI0, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmaxa.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC2, VI2, VI1, VT0
+    vfmaxa.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC4, VI4, VI3, VT0
+    vfmaxa.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC4, VINC2, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26:
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    vfcmp.clt.s VT0, VI2, VI0
+    vbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    vfcmp.clt.s VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    vfcmp.clt.s VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.s i0, $f20
+    .align 3
+
+#endif
+.L21: // N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    LD $f9, X, 0
+    VFMAXA VM1, x1, VM0
+    VCMPEQ VT0, VM0, VM1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    addi.d I, I, -1
+    addi.d i1, i1, 1
+    add.d X, X, INCX
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    MTG i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/iamin_lasx.S b/kernel/loongarch64/iamin_lasx.S
new file mode 100644
index 000000000..6ea117907
--- /dev/null
+++ b/kernel/loongarch64/iamin_lasx.S
@@ -0,0 +1,486 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfmina.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmina.d VM1, VM0, VM1 +#else + addi.d I, I, -1 + xvadd.w VI2, VI1, VINC8 + xvfmina.s VM1, VX0, VM0 +#endif + XVCMPEQ VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 + blt $r0, 
I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmina.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmina.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 +#endif + xvbitsel.v VINC4, VI2, VI1, VT0 + XVFMINA VM0, x4, x3 + XVCMPEQ VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + XVFMINA VM0, VM0, VM1 + XVCMPEQ VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, 
X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfmina.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmina.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvadd.w VI2, VI1, VINC8 + xvfmina.s VM1, VX0, VM0 + xvfcmp.ceq.s VT0, VM1, VM0 +#endif + addi.d I, I, -1 + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 + .align 3 +#else + fmov.s $f16, $f20 + .align 3 + +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfmina.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmina.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmina.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + xvfmina.s VM0, VX0, VM0 + xvfcmp.ceq.s VT0, VM0, VX0 + xvbitsel.v VI0, VI0, VI1, VT0 + movfr2gr.s i0, $f20 + +#endif + +.L21: // N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD $f9, X, 0 + addi.d I, I, -1 + XVFMINA VM1, x1, VM0 + XVCMPEQ VT0, VM0, VM1 + add.d X, X, INCX + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + 
.align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/iamin_lsx.S b/kernel/loongarch64/iamin_lsx.S new file mode 100644 index 000000000..ce885fd88 --- /dev/null +++ b/kernel/loongarch64/iamin_lsx.S @@ -0,0 +1,446 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#ifdef DOUBLE +#define VINC2 $vr17 +#define VINC4 $vr18 +#else +#define VINC4 $vr17 +#define VINC8 $vr18 +#endif +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfmina.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfmina.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmina.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + addi.d I, I, -1 + vbitsel.v x2, x4, x2, VT0 + vfmina.d VM1, VM0, x3 +#else + vadd.w VI1, VI1, VINC8 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC4 + vfmina.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + addi.d I, I, -1 + vbitsel.v x2, VI2, VI1, VT0 + vfmina.s VM1, VM0, VM1 +#endif + VCMPEQ VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, x2, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + 
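+// Strided path (INCX != 1): vector lanes are filled with scalar loads while
+// VI1/VI2 carry the running element indices; VM0 and VI0 hold the current
+// minimum magnitude and its index, mirroring the unit-stride loop above.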
+.L20: // INCX!=1
+    move TEMP, X
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+
+    vinsgr2vr.d VM0, t2, 1
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w i0, i0, 1
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L24:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI1, VINC4
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfmina.d x1, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x1
+    vbitsel.v x2, VI2, VI1, VT0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI2, VINC2
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfmina.d x3, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x3
+    vbitsel.v x4, VI2, VI1, VT0
+    vfmina.d x3, x1, x3
+    vfcmp.ceq.d VT0, x1, x3
+    addi.d I, I, -1
+    vbitsel.v x2, x4, x2, VT0
+    // merge this group's min into VM0 and keep the matching index in VI0
+    vfmina.d VM1, VM0, x3
+    vfcmp.ceq.d VT0, VM0, VM1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, x2, VI0, VT0
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vadd.w VI1, VI1, VINC8
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vadd.w VI2, VI1, VINC4
+    vfmina.s VM1, VX0, VX1
+    vfcmp.ceq.s VT0, VX0, VM1
+    vbitsel.v VI2, VI2, VI1, VT0
+    vfmina.s VM1, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    addi.d I, I, -1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI2, VI0, VT0
+#endif
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmina.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC4, VI2, VI1, VT0
+    vfmina.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC8, VI4, VI3, VT0
+    vfmina.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC8, VINC4, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+#endif
+    .align 3
+
+.L26:
+#ifdef DOUBLE
+    vfmina.d VM0, x1, x2
+    vfcmp.ceq.d VT0, x1, VM0
+    vbitsel.v VI0, VI2, VI1, VT0
+    .align 3
+
+.L27:
+    movfr2gr.d i0, $f20
+
+#else
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    vfcmp.clt.s VT0, VI2, VI0
+    vbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    vfcmp.clt.s VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    vfcmp.clt.s VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.s i0, $f20
+#endif
+    .align 3
+
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    LD $f9, X, 0
+    addi.d I, I, -1
+    VFMINA VM1, x1, VM0
+    VCMPEQ VT0, VM0, VM1
+    add.d X, X, INCX
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    addi.d i1, i1, 1
+    MTC $f21, i1
+    blt $r0, I, .L22
+    MTG i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/icamax_lasx.S b/kernel/loongarch64/icamax_lasx.S
new file mode 100644
index 000000000..7800cb917
--- /dev/null
+++ b/kernel/loongarch64/icamax_lasx.S
@@ -0,0 +1,562 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + xvxor.v VM0, VM0, VM0 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + xvxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + xvreplgr2vr.d VI4, I + xvffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + li.w I, -1 + xvreplgr2vr.w VI4, I + xvffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC4 + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 +#else + xvadd.w VI1, VI1, VINC8 + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VI4, x1 + xvfmul.s x4, VI4, x2 + xvfcmp.clt.s VT0, x1, VI3 + xvfcmp.clt.s VINC4, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC4 +#endif + XVFADD x1, x1, x2 + XVFMAX x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + 
xvfmax.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmax.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmax.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + xvadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + addi.d I, I, -1 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + xvfadd.d x1, x1, x2 + xvfmax.d x3, VM0, x1 + xvfcmp.ceq.d VT0, x3, VM0 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * 
SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VI4, x1 + xvfmul.s x4, VI4, x2 + xvfcmp.clt.s VT0, x1, VI3 + xvfcmp.clt.s VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + xvfadd.s x1, x1, x2 + xvfmax.s x3, VM0, x1 + xvfcmp.ceq.s VT0, x3, VM0 +#endif + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 +#else + fmov.s $f16, $f20 +#endif + .align 3 + +#ifdef DOUBLE +#else +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v x1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, 
VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + fcmp.clt.s $fcc0, $f15, $f13 + fsel $f15, $f15, $f13, $fcc0 + fsel $f20, $f20, $f16, $fcc0 + movfr2gr.s i0, $f20 + +#endif +.L21: //N<8 +#ifdef DOUBLE + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 +#else + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 +#endif + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMAX a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamax_lsx.S b/kernel/loongarch64/icamax_lsx.S new file mode 100644 index 000000000..a2fc9dbbd --- /dev/null +++ b/kernel/loongarch64/icamax_lsx.S @@ -0,0 +1,434 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + vxor.v VM0, VM0, VM0 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + vxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + vreplgr2vr.d VI4, I + vffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + li.w I, -1 + vreplgr2vr.w VI4, I + vffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + addi.d I, I, -1 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 +#else + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VI4, x1 + vfmul.s x4, VI4, x2 +#endif + VCMPLT VT0, x1, VI3 + VCMPLT VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + VFADD x1, x1, x2 + VFMAX x3, VM0, x1 + VCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + 
vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + ld.d t1, X, 0 * SIZE + vfcmp.ceq.d VT0, x3, VM0 + ld.d t2, X, 1 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + addi.d I, I, -1 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vadd.w VI1, VI1, VINC4 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VI4, x1 + vfmul.s x4, VI4, x2 + vfcmp.clt.s VT0, x1, VI3 + vfcmp.clt.s VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.s x1, x1, x2 + vfmax.s x3, VM0, x1 + vfcmp.ceq.s VT0, x3, VM0 +#endif + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 
2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +#ifdef DOUBLE +.L26: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 +#else +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +#endif +.L21: //N<4 + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMAX a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamin_lasx.S b/kernel/loongarch64/icamin_lasx.S new file mode 100644 index 000000000..01abd45b2 --- /dev/null +++ b/kernel/loongarch64/icamin_lasx.S @@ -0,0 +1,555 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 +#ifdef DOUBLE + xvreplve0.d VM0, VM0 + xvxor.v VI3, VI3, VI3 // 0 + li.d I, -1 + xvreplgr2vr.d VI4, I + xvffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + xvreplve0.w VM0, VM0 + xvxor.v VI3, VI3, VI3 // 0 + li.w I, -1 + xvreplgr2vr.w VI4, I + xvffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC4 + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 +#else + xvadd.w VI1, VI1, VINC8 + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VI4, x1 + xvfmul.s x4, VI4, x2 + xvfcmp.clt.s VT0, x1, VI3 + xvfcmp.clt.s VINC4, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC4 +#endif + XVFADD x1, x1, x2 + XVFMIN x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + 
xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmin.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmin.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmin.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + xvadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w 
x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + addi.d I, I, -1 + XVFMUL x3, VI4, x1 + XVFMUL x4, VI4, x2 + XVCMPLT VT0, x1, VI3 + XVCMPLT VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + XVFADD x1, x1, x2 + XVFMIN x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmina.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmina.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmina.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 +#endif + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 + .align 3 + +.L21: //N<4 + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 +#else + fmov.s $f16, $f20 + .align 3 + +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v x1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, 
.L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + fcmp.clt.s $fcc0, $f15, $f13 + fsel $f15, $f15, $f13, $fcc0 + fsel $f20, $f20, $f16, $fcc0 + movfr2gr.s i0, $f20 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 +#endif + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMIN a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamin_lsx.S b/kernel/loongarch64/icamin_lsx.S new file mode 100644 index 000000000..a08cd33c5 --- /dev/null +++ b/kernel/loongarch64/icamin_lsx.S @@ -0,0 +1,425 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 + vreplvei.w VM0, VM0, 0 + vxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + vreplgr2vr.d VI4, I + vffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + li.w I, -1 + vreplgr2vr.w VI4, I + vffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + addi.d I, I, -1 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmin.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + VFMUL x3, VI4, x1 + VFMUL x4, VI4, x2 + VCMPLT VT0, x1, VI3 + VCMPLT VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + VFADD x1, x1, x2 + VFMIN x3, VM0, x1 + VCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, 
VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmin.d x3, VM0, x1 + ld.d t1, X, 0 * SIZE + vfcmp.ceq.d VT0, x3, VM0 + ld.d t2, X, 1 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vadd.w VI1, VI1, VINC4 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + addi.d I, I, -1 + VFMUL x3, VI4, x1 + VFMUL x4, VI4, x2 + VCMPLT VT0, x1, VI3 + VCMPLT VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + VFADD x1, x1, x2 + VFMIN x3, VM0, x1 + VCMPEQ VT0, x3, VM0 + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + 
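+ // the compare below tells which partial minimum vfmina kept, so VI0 can take the matching lane index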
vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + vfmina.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 +#endif + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: +#ifdef DOUBLE + movfr2gr.d i0, $f20 + .align 3 +#else + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +#endif +.L21: //N<4 + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMIN a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/imax_lasx.S b/kernel/loongarch64/imax_lasx.S new file mode 100644 index 000000000..2d3d5e9d3 --- /dev/null +++ b/kernel/loongarch64/imax_lasx.S @@ -0,0 +1,533 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX0, VX1 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else + xvadd.w VI1, VI1, VINC8 + xvfcmp.clt.s VT0, VM0, VX0 + addi.d I, I, -1 + xvbitsel.v VM0, VM0, VX0, VT0 + xvbitsel.v VI0, VI0, VI1, VT0 + addi.d X, X, 8 * SIZE +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfcmp.clt.d VT0, x1, x2 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 +#endif + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + XVCMPLT VT0, x3, x4 + 
xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + XVCMPLT VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE +#endif + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX +#ifdef DOUBLE + xvinsgr2vr.d VM0, t1, 0 + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI1, VI1, VINC8 + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX0, VX1 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else + 
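+ // single precision: gather eight strided elements into VX0, then update the running maximum VM0 and its index vector VI0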
ld.w t1, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t2, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t3, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t4, X, 0 * SIZE
+ add.d X, X, INCX
+ xvinsgr2vr.w VX0, t1, 0
+ xvinsgr2vr.w VX0, t2, 1
+ xvinsgr2vr.w VX0, t3, 2
+ xvinsgr2vr.w VX0, t4, 3
+ ld.w t1, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t2, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t3, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t4, X, 0 * SIZE
+ add.d X, X, INCX
+ xvinsgr2vr.w VX0, t1, 4
+ xvinsgr2vr.w VX0, t2, 5
+ xvinsgr2vr.w VX0, t3, 6
+ xvinsgr2vr.w VX0, t4, 7
+ xvadd.w VI1, VI1, VINC8
+ xvfcmp.clt.s VT0, VM0, VX0
+ addi.d I, I, -1
+ xvbitsel.v VM0, VM0, VX0, VT0
+ xvbitsel.v VI0, VI0, VI1, VT0
+#endif
+ blt $r0, I, .L24
+ .align 3
+
+.L25:
+#ifdef DOUBLE
+ xvpickve.d VI1, VI0, 0
+ xvpickve.d VI2, VI0, 1
+ xvpickve.d VI3, VI0, 2
+ xvpickve.d VI4, VI0, 3
+ xvpickve.d x1, VM0, 0
+ xvpickve.d x2, VM0, 1
+ xvpickve.d x3, VM0, 2
+ xvpickve.d x4, VM0, 3
+#else
+ xvxor.v VX0, VX0, VX0
+ xvor.v VX0, VI0, VX0
+ xvxor.v VX1, VX1, VX1
+ xvor.v VX1, VM0, VX1
+ xvpickve.w VI1, VI0, 0
+ xvpickve.w VI2, VI0, 1
+ xvpickve.w VI3, VI0, 2
+ xvpickve.w VI4, VI0, 3
+ xvpickve.w x1, VM0, 0
+ xvpickve.w x2, VM0, 1
+ xvpickve.w x3, VM0, 2
+ xvpickve.w x4, VM0, 3
+#endif
+ XVCMPLT VT0, x1, x2
+ xvbitsel.v VM1, x1, x2, VT0
+ xvbitsel.v VINC4, VI1, VI2, VT0
+ XVCMPLT VT0, x3, x4
+ xvbitsel.v VM0, x3, x4, VT0
+ xvbitsel.v VINC8, VI3, VI4, VT0
+ XVCMPLT VT0, VM0, VM1
+ xvbitsel.v VM0, VM0, VM1, VT0
+ xvbitsel.v VI0, VINC8, VINC4, VT0
+ fcmp.ceq.d $fcc0, $f15, $f9
+ bceqz $fcc0, .L26
+ XVCMPLT VT0, VI1, VI0
+ xvbitsel.v VI0, VI0, VI1, VT0
+ .align 3
+
+.L26:
+ fcmp.ceq.d $fcc0, $f15, $f10
+ bceqz $fcc0, .L27
+ XVCMPLT VT0, VI2, VI0
+ xvbitsel.v VI0, VI0, VI2, VT0
+ .align 3
+
+.L27:
+ fcmp.ceq.d $fcc0, $f15, $f11
+ bceqz $fcc0, .L28
+ XVCMPLT VT0, VI3, VI0
+ xvbitsel.v VI0, VI0, VI3, VT0
+ .align 3
+
+.L28:
+ fcmp.ceq.d $fcc0, $f15, $f12
+ bceqz $fcc0, .L29
+ XVCMPLT VT0, VI4, VI0
+ xvbitsel.v VI0, VI0, VI4, VT0
+ .align 3
+
+.L29:
+#ifdef DOUBLE
+ movfr2gr.d i0, $f20
+#else
+ fmov.s $f16, $f20
+#endif
+ .align 3
+
+#ifndef DOUBLE
+.L252:
+ xvxor.v VI0, VI0, VI0
+ xvor.v VI0, VI0, VX0
+ fmov.s $f13, $f15
+ xvxor.v VM0, VM0, VM0
+ xvor.v VM0, VM0, VX1
+ xvpickve.w VI1, VI0, 4
+ xvpickve.w VI2, VI0, 5
+ xvpickve.w VI3, VI0, 6
+ xvpickve.w VI4, VI0, 7
+ xvpickve.w x1, VM0, 4
+ xvpickve.w x2, VM0, 5
+ xvpickve.w x3, VM0, 6
+ xvpickve.w x4, VM0, 7
+ xvfcmp.clt.s VT0, x1, x2
+ xvbitsel.v x1, x1, x2, VT0
+ xvbitsel.v VINC4, VI1, VI2, VT0
+ xvfcmp.clt.s VT0, x3, x4
+ xvbitsel.v VM0, x3, x4, VT0
+ xvbitsel.v VINC8, VI3, VI4, VT0
+ xvfcmp.clt.s VT0, VM0, x1
+ xvbitsel.v VM0, VM0, x1, VT0
+ xvbitsel.v VI0, VINC8, VINC4, VT0
+ li.d TEMP, 1 //when the values are equal, take the smallest index
+ movgr2fr.w $f17, TEMP
+ ffint.s.w $f17, $f17
+ xvfcmp.ceq.s VT0, VM0, x1
+ fcmp.ceq.s $fcc0, $f23, $f17
+ bceqz $fcc0, .L262
+ xvfcmp.clt.s VT0, VI1, VI0
+ xvbitsel.v VI0, VI0, VI1, VT0
+ .align 3
+
+.L262:
+ xvfcmp.ceq.s VT0, VM0, x2
+ fcmp.ceq.s $fcc0, $f23, $f17
+ bceqz $fcc0, .L272
+ xvfcmp.clt.s VT0, VI2, VI0
+ xvbitsel.v VI0, VI0, VI2, VT0
+ .align 3
+
+.L272:
+ xvfcmp.ceq.s VT0, VM0, x3
+ fcmp.ceq.s $fcc0, $f23, $f17
+ bceqz $fcc0, .L282
+ xvfcmp.clt.s VT0, VI3, VI0
+ xvbitsel.v VI0, VI0, VI3, VT0
+ .align 3
+
+.L282:
+ xvfcmp.ceq.s VT0, VM0, x4
+ fcmp.ceq.s $fcc0, $f23, $f17
+ bceqz $fcc0, .L292
+ xvfcmp.clt.s VT0, VI4, VI0
+ xvbitsel.v VI0, VI0, VI4, VT0
+ .align 3
+
+.L292:
+ fcmp.clt.s $fcc0, $f15, $f13
+ fsel $f15, $f15, $f13, $fcc0
+ fsel $f20, $f20, $f16, $fcc0
+ movfr2gr.s i0, $f20
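+ // i0 now holds the 1-based index of the current maximum; the scalar tail loop below may still update it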
+#endif + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + CMPLT $fcc0, $f15, $f9 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/imax_lsx.S b/kernel/loongarch64/imax_lsx.S new file mode 100644 index 000000000..92556d4e6 --- /dev/null +++ b/kernel/loongarch64/imax_lsx.S @@ -0,0 +1,428 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, $vr18 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, $vr17 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x1, x3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, VM0, x1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + vadd.w VI1, VI1, $vr18 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VCMPLT VT0, x1, x2 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + VCMPLT VT0, x3, x4 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + VCMPLT VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 
1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, $vr18 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, $vr17 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x1, x3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, VM0, x1 + addi.d I, I, -1 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, $vr18 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 
+ vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfcmp.clt.s VT0, x1, x2 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + vfcmp.clt.s VT0, x3, x4 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + vfcmp.clt.s VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + VCMPLT VT0, x1, x2 + vbitsel.v VM0, x1, x2, VT0 + vbitsel.v VI0, VI1, VI2, VT0 +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + VCMPLT VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + .align 3 + +.L27: +#ifndef DOUBLE + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + VCMPLT VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + VCMPLT VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#endif + MTG i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + CMPLT $fcc0, $f15, $f9 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/imin_lasx.S b/kernel/loongarch64/imin_lasx.S new file mode 100644 index 000000000..5306828e2 --- /dev/null +++ b/kernel/loongarch64/imin_lasx.S @@ -0,0 +1,534 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + XVCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + XVCMPLT VT0, VM1, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else + xvadd.w VI1, VI1, VINC8 + XVCMPLT VT0, VX0, VM0 + addi.d I, I, -1 + xvbitsel.v VM0, VM0, VX0, VT0 + xvbitsel.v VI0, VI0, VI1, VT0 + addi.d X, X, 8 * SIZE +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 +#endif + XVCMPLT VT0, x2, x1 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + XVCMPLT VT0, x4, x3 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, 
VI3, VI4, VT0 + XVCMPLT VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI1, VI1, VINC8 + xvadd.d VI2, VI1, VINC4 + XVCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + XVCMPLT VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + 
xvbitsel.v VI0, VI0, VI2, VT0
+#else
+ ld.w t1, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t2, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t3, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t4, X, 0 * SIZE
+ add.d X, X, INCX
+ xvinsgr2vr.w VX0, t1, 0
+ xvinsgr2vr.w VX0, t2, 1
+ xvinsgr2vr.w VX0, t3, 2
+ xvinsgr2vr.w VX0, t4, 3
+ ld.w t1, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t2, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t3, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.w t4, X, 0 * SIZE
+ add.d X, X, INCX
+ xvinsgr2vr.w VX0, t1, 4
+ xvinsgr2vr.w VX0, t2, 5
+ xvinsgr2vr.w VX0, t3, 6
+ xvinsgr2vr.w VX0, t4, 7
+ xvadd.w VI1, VI1, VINC8
+ XVCMPLT VT0, VX0, VM0
+ addi.d I, I, -1
+ xvbitsel.v VM0, VM0, VX0, VT0
+ xvbitsel.v VI0, VI0, VI1, VT0
+#endif
+ blt $r0, I, .L24
+ .align 3
+
+.L25:
+#ifdef DOUBLE
+ xvpickve.d VI1, VI0, 0
+ xvpickve.d VI2, VI0, 1
+ xvpickve.d VI3, VI0, 2
+ xvpickve.d VI4, VI0, 3
+ xvpickve.d x1, VM0, 0
+ xvpickve.d x2, VM0, 1
+ xvpickve.d x3, VM0, 2
+ xvpickve.d x4, VM0, 3
+#else
+ xvxor.v VX0, VX0, VX0
+ xvor.v VX0, VI0, VX0
+ xvxor.v VX1, VX1, VX1
+ xvor.v VX1, VM0, VX1
+ xvpickve.w VI1, VI0, 0
+ xvpickve.w VI2, VI0, 1
+ xvpickve.w VI3, VI0, 2
+ xvpickve.w VI4, VI0, 3
+ xvpickve.w x1, VM0, 0
+ xvpickve.w x2, VM0, 1
+ xvpickve.w x3, VM0, 2
+ xvpickve.w x4, VM0, 3
+#endif
+ XVCMPLT VT0, x2, x1
+ xvbitsel.v VM1, x1, x2, VT0
+ xvbitsel.v VINC4, VI1, VI2, VT0
+ XVCMPLT VT0, x4, x3
+ xvbitsel.v VM0, x3, x4, VT0
+ xvbitsel.v VINC8, VI3, VI4, VT0
+ XVCMPLT VT0, VM1, VM0
+ xvbitsel.v VM0, VM0, VM1, VT0
+ xvbitsel.v VI0, VINC8, VINC4, VT0
+ fcmp.ceq.d $fcc0, $f15, $f9
+ bceqz $fcc0, .L26
+ XVCMPLT VT0, VI1, VI0
+ xvbitsel.v VI0, VI0, VI1, VT0
+ .align 3
+
+.L26:
+ fcmp.ceq.d $fcc0, $f15, $f10
+ bceqz $fcc0, .L27
+ XVCMPLT VT0, VI2, VI0
+ xvbitsel.v VI0, VI0, VI2, VT0
+ .align 3
+
+.L27:
+ fcmp.ceq.d $fcc0, $f15, $f11
+ bceqz $fcc0, .L28
+ XVCMPLT VT0, VI3, VI0
+ xvbitsel.v VI0, VI0, VI3, VT0
+ .align 3
+
+.L28:
+ fcmp.ceq.d $fcc0, $f15, $f12
+ bceqz $fcc0, .L29
+ XVCMPLT VT0, VI4, VI0
+ xvbitsel.v VI0, VI0, VI4, VT0
+ .align 3
+
+.L29:
+#ifdef DOUBLE
+ MTG i0, $f20
+#else
+ fmov.s $f16, $f20
+#endif
+ .align 3
+
+#ifndef DOUBLE
+.L252:
+ xvxor.v VI0, VI0, VI0
+ xvor.v VI0, VI0, VX0
+ fmov.s $f13, $f15
+ xvxor.v VM0, VM0, VM0
+ xvor.v VM0, VM0, VX1
+ xvpickve.w VI1, VI0, 4
+ xvpickve.w VI2, VI0, 5
+ xvpickve.w VI3, VI0, 6
+ xvpickve.w VI4, VI0, 7
+ xvpickve.w x1, VM0, 4
+ xvpickve.w x2, VM0, 5
+ xvpickve.w x3, VM0, 6
+ xvpickve.w x4, VM0, 7
+ XVCMPLT VT0, x2, x1
+ xvbitsel.v x1, x1, x2, VT0
+ xvbitsel.v VINC4, VI1, VI2, VT0
+ XVCMPLT VT0, x4, x3
+ xvbitsel.v VM0, x3, x4, VT0
+ xvbitsel.v VINC8, VI3, VI4, VT0
+ XVCMPLT VT0, x1, VM0
+ xvbitsel.v VM0, VM0, x1, VT0
+ xvbitsel.v VI0, VINC8, VINC4, VT0
+ li.d TEMP, 1 //when the values are equal, take the smallest index
+ movgr2fr.w $f17, TEMP
+ ffint.s.w $f17, $f17
+ xvfcmp.ceq.s VT0, VM0, x1
+ fcmp.ceq.s $fcc0, $f23, $f17
+ bceqz $fcc0, .L262
+ XVCMPLT VT0, VI1, VI0
+ xvbitsel.v VI0, VI0, VI1, VT0
+ .align 3
+
+.L262:
+ xvfcmp.ceq.s VT0, VM0, x2
+ fcmp.ceq.s $fcc0, $f23, $f17
+ bceqz $fcc0, .L272
+ XVCMPLT VT0, VI2, VI0
+ xvbitsel.v VI0, VI0, VI2, VT0
+ .align 3
+
+.L272:
+ xvfcmp.ceq.s VT0, VM0, x3
+ fcmp.ceq.s $fcc0, $f23, $f17
+ bceqz $fcc0, .L282
+ XVCMPLT VT0, VI3, VI0
+ xvbitsel.v VI0, VI0, VI3, VT0
+ .align 3
+
+.L282:
+ xvfcmp.ceq.s VT0, VM0, x4
+ fcmp.ceq.s $fcc0, $f23, $f17
+ bceqz $fcc0, .L292
+ XVCMPLT VT0, VI4, VI0
+ xvbitsel.v VI0, VI0, VI4, VT0
+ .align 3
+
+.L292:
+ CMPLT $fcc0, $f13, $f15
+ fsel $f15, $f15, $f13, $fcc0
+ fsel $f20, $f20, $f16, $fcc0
+ MTG i0, $f20
+#endif
+
+.L21: //N<8 + 
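+ // scalar tail: walk the remaining N % 8 elements one at a time, keeping the smaller value and its index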
andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + CMPLT $fcc0, $f9, $f15 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/imin_lsx.S b/kernel/loongarch64/imin_lsx.S new file mode 100644 index 000000000..a0c411e7a --- /dev/null +++ b/kernel/loongarch64/imin_lsx.S @@ -0,0 +1,428 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, $vr18 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, $vr17 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x3, x1 + addi.d X, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, x1, VM0 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + vadd.w VI1, VI1, $vr18 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM1, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VCMPLT VT0, x2, x1 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + VCMPLT VT0, x4, x3 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + VCMPLT VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 
1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, $vr18 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, $vr17 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x3, x1 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, x1, VM0 + addi.d I, I, -1 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, $vr18 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 
+ vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VCMPLT VT0, x2, x1 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + VCMPLT VT0, x4, x3 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + VCMPLT VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + VCMPLT VT0, x2, x1 + vbitsel.v VM0, x1, x2, VT0 + vbitsel.v VI0, VI1, VI2, VT0 +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + VCMPLT VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + .align 3 + +.L27: +#ifndef DOUBLE + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + VCMPLT VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + VCMPLT VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#endif + MTG i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + CMPLT $fcc0, $f9, $f15 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/max_lasx.S b/kernel/loongarch64/max_lasx.S new file mode 100644 index 000000000..3215ae394 --- /dev/null +++ b/kernel/loongarch64/max_lasx.S @@ -0,0 +1,229 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMAX VM1, VX0, VX1 + XVFMAX VM2, VX2, VX3 + XVFMAX VM0, VM0, VM1 + XVFMAX VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMAX VM1, VX0, VX1 + XVFMAX VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAX VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAX VM1, VX0, VX1 + XVFMAX VM2, VX2, VX3 + XVFMAX VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMAX VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAX $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + XVFMAX VM1, VX0, VX1 // signed maximum of the gathered lanes + XVFMAX VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + XVFMAX VM0, VM0, VM1 // signed maximum, matching the unit-stride path +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAX VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAX VM1, VX0, VX1 + XVFMAX VM2, VX2, VX3 + XVFMAX VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMAX VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAX $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff
--git a/kernel/loongarch64/max_lsx.S b/kernel/loongarch64/max_lsx.S new file mode 100644 index 000000000..a2100875d --- /dev/null +++ b/kernel/loongarch64/max_lsx.S @@ -0,0 +1,228 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMAX VM1, VX0, VX1 + VFMAX VM2, VX2, VX3 + VFMAX VM0, VM0, VM1 + VFMAX VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMAX VM1, VX0, VX1 + VFMAX VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAX VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAX VM1, VX0, VX1 + VFMAX VM2, VX2, VX3 + VFMAX VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAX $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + 
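+ // pack the two strided doubles into VX0 lanes 0-1; the next pair is gathered into VX1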
vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + VFMAX VM1, VX0, VX1 // signed maximum of the gathered lanes + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + VFMAX VM2, VX0, VX1 + VFMAX VM1, VM1, VM2 + VFMAX VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + VFMAX VM1, VX0, VX1 + VFMAX VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAX VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAX VM1, VX0, VX1 + VFMAX VM2, VX2, VX3 + VFMAX VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAX $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/min_lasx.S b/kernel/loongarch64/min_lasx.S new file mode 100644 index 000000000..890c8882d --- /dev/null +++ b/kernel/loongarch64/min_lasx.S @@ -0,0 +1,229 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMIN VM1, VX0, VX1 + XVFMIN VM2, VX2, VX3 + XVFMIN VM0, VM0, VM1 + XVFMIN VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMIN VM1, VX0, VX1 + XVFMIN VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMIN VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMIN VM1, VX0, VX1 + XVFMIN VM2, VX2, VX3 + XVFMIN VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMIN VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMIN $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + XVFMIN VM1, VX0, VX1 // keep the running minimum of the gathered lanes + XVFMIN VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + XVFMIN VM0, VM0, VM1 // keep the running minimum, matching the unit-stride path +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMIN VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMIN VM1, VX0, VX1 + XVFMIN VM2, VX2, VX3 + XVFMIN VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMIN VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMIN $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff
--git a/kernel/loongarch64/min_lsx.S b/kernel/loongarch64/min_lsx.S new file mode 100644 index 000000000..1bc32a0ed --- /dev/null +++ b/kernel/loongarch64/min_lsx.S @@ -0,0 +1,228 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMIN VM1, VX0, VX1 + VFMIN VM2, VX2, VX3 + VFMIN VM0, VM0, VM1 + VFMIN VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMIN VM1, VX0, VX1 + VFMIN VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMIN VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMIN VM1, VX0, VX1 + VFMIN VM2, VX2, VX3 + VFMIN VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMIN $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + 
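+ // gather the strided doubles into vector lanes before taking the vector minimum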
vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + VFMIN VM1, VX0, VX1 // keep the running minimum of the gathered lanes + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + VFMIN VM2, VX0, VX1 + VFMIN VM1, VM1, VM2 + VFMIN VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + VFMIN VM1, VX0, VX1 + VFMIN VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMIN VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMIN VM1, VX0, VX1 + VFMIN VM2, VX2, VX3 + VFMIN VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMIN $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/rot_lasx.S b/kernel/loongarch64/rot_lasx.S new file mode 100644 index 000000000..5d7e3d7cc --- /dev/null +++ b/kernel/loongarch64/rot_lasx.S @@ -0,0 +1,1602 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VT0 $xr10 +#define VT1 $xr18 +#define VXC $xr23 +#define VXS $xr9 +#define VXZ $xr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT +#ifdef DOUBLE + movfr2gr.d t1, C + xvreplgr2vr.d VXC, t1 + movfr2gr.d t2, S + xvreplgr2vr.d VXS, t2 + movfr2gr.d t3, a1 + xvreplgr2vr.d VXZ, t3 +#else + movfr2gr.s t1, C + xvreplgr2vr.w VXC, t1 + movfr2gr.s t2, S + xvreplgr2vr.w VXS, t2 + movfr2gr.s t3, a1 + xvreplgr2vr.w VXZ, t3 +#endif + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE +#endif + XVMUL VT0, VX0, VXC + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VX0, VXS + XVMSUB VT1, VX2, VXC, VT1 + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE +#ifdef DOUBLE + XVMUL VT0, VX1, VXC + XVFMADD VT0, VX3, VXS, VT0 + XVMUL VT1, VX1, VXS + XVMSUB VT1, VX3, VXC, VT1 + xvst VT0, X, 4 * SIZE + xvst VT1, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE +#endif + XVMUL VT0, VX0, VXC + XVMUL VT1, VX2, VXC + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE +#ifdef DOUBLE + XVMUL VT0, VX1, VXC + XVMUL VT1, VX3, VXC + xvst VT0, X, 4 * SIZE + xvst VT1, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE +#endif + XVMUL VT0, VX2, VXS + XVMUL VT1, VX0, VXS + XVFSUB VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE +#ifdef DOUBLE + XVMUL VT0, VX3, VXS + XVMUL VT1, VX1, VXS + xvfsub.d VT1, VXZ, VT1 + xvst VT0, X, 4 * SIZE + xvst VT1, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + xvst VXZ, X, 0 * SIZE + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 4 * SIZE + xvst VXZ, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L120 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L122 // C!=0 S==0 + b .L121 // C!=0 S!=0 + .align 3 + +.L120: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L124 // C==0 S==0 + b 
.L123 // C==0 S!=0 + .align 3 + +.L121: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY +#endif + XVMUL VT0, VX0, VXC + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VX0, VXS + XVMSUB VT1, VX2, VXC, VT1 + +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvst VT0, X, 0 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + XVMUL VT0, VX1, VXC + XVFMADD VT0, VX3, VXS, VT0 + XVMUL VT1, VX1, VXS + XVMSUB VT1, VX3, VXC, VT1 + xvst VT0, X, 4 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 +#else + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // C!=0 S==0 +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX0, VXC + xvfmul.d VT1, VX2, VXC + xvld VX1, X, 4 * SIZE + xvst VT0, X, 0 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX1, VXC + xvfmul.d VT1, VX3, VXC + addi.d I, I, -1 + xvst VT0, X, 4 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d 
Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmul.s VT1, VX2, VXC + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // C==0 S!=0 +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX2, VXS + xvfmul.d VT1, VX0, VXS + xvfsub.d VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX3, VXS + xvfmul.d VT1, VX1, VXS + xvfsub.d VT1, VXZ, VT1 + addi.d I, I, -1 + xvst VT0, X, 4 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX2, VXS + xvfmul.s VT1, VX0, VXS + xvfsub.s VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // C==0 S==0 + xvst VXZ, X, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 0 * SIZE + xvst VXZ, X, 4 * SIZE + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + 
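+ // (remaining zeroed Y lanes are stored one element at a time, since INCY may be non-unit)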
xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 +#else + xvst VXZ, X, 0 * SIZE + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L124 + move Y, YY + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L210 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L212 // C!=0 S==0 + b .L211 // C!=0 S!=0 + .align 3 + +.L210: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L214 // C==0 S==0 + b .L213 // C==0 S!=0 + .align 3 + +.L211: // C!=0 S!=0 +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VXC, VX0 + xvfmadd.d VT0, VX2, VXS, VT0 + xvfmul.d VT1, VXS, VX0 + xvfmsub.d VT1, VX2, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VX1, VXC + xvfmadd.d VT0, VX3, VXS, VT0 + xvfmul.d VT1, VX1, VXS + xvfmsub.d VT1, VX3, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXC, VX0 + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b 
.L997 + .align 3 + +.L212: // C!=0 S==0 +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VXC, VX0 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX2, VXC + xvst VT1, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VX1, VXC + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX3, VXC // y' = c*y for the second block as well (s == 0 on this path) + xvst VT1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXC, VX0 + xvfmul.s VT1, VX2, VXC + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // C==0 S!=0 +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VXS, VX2 + xvfmul.d VT1, VXS, VX0 + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VX3, VXS + xvfmul.d VT1, VX1, VXS + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX
 + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXS, VX2 + xvfmul.s VT1, VXS, VX0 + xvfsub.s VT1, VXZ, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // C==0 S==0 +#ifdef DOUBLE + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvst VXZ, Y, 0 * SIZE // both vectors become zero on this path + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvst VXZ, Y, 4 * SIZE +#else + xvstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvst VXZ, Y, 0 * SIZE + xvstelm.w VXZ, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 7 + add.d XX, XX, INCX +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX0, VXC + xvfmadd.d VT0, VX2, VXS, VT0 + xvfmul.d VT1, VX0, VXS + xvfmsub.d VT1, VX2, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX
 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX1, VXC + xvfmadd.d VT0, VX3, VXS, VT0 + xvfmul.d VT1, VX1, VXS // s*x for the second block (VX1/VX3) + xvfmsub.d VT1, VX3, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY +#endif + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX0, VXC + xvfmul.d VT1, VX2, VXC + xvstelm.d VT0, XX, 0, 
0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX1, VXC + xvfmul.d VT1, VX3, VXC + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmul.s VT1, VX2, VXC + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY +#endif + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + 
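+ // finish gathering the strided X elements into VX0 lanes 2-3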
xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX2, VXS + xvfmul.d VT1, VX0, VXS + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX3, VXS + xvfmul.d VT1, VX1, VXS // s*x for the second block (VX1) + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX2, VXS + xvfmul.s VT1, VX0, VXS + xvfsub.s VT1, VXZ, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + 
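+ // scatter the remaining rotated Y lanes back through YY with stride INCY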
xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY +#endif + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 +#ifdef DOUBLE + move X, XX + move Y, YY +#endif + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE + MUL $f10, $f12, C + MADD $f10, $f13, S, $f10 + ST $f10, X, 0 * SIZE + addi.d I, I, -1 + MUL $f20, $f12, S + MSUB $f20, $f13, C, $f20 + ST $f20, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/rot_lsx.S b/kernel/loongarch64/rot_lsx.S new file mode 100644 index 000000000..4b0e59310 --- /dev/null +++ b/kernel/loongarch64/rot_lsx.S @@ -0,0 +1,1791 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VT0 $vr10 +#define VT1 $vr18 +#define VXC $vr23 +#define VXS $vr9 +#define VXZ $vr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT +#ifdef DOUBLE + movfr2gr.d t1, C + vreplgr2vr.d VXC, t1 + movfr2gr.d t2, S + vreplgr2vr.d VXS, t2 + movfr2gr.d t3, a1 + vreplgr2vr.d VXZ, t3 +#else + movfr2gr.s t1, C + vreplgr2vr.w VXC, t1 + movfr2gr.s t2, S + vreplgr2vr.w VXS, t2 + movfr2gr.s t3, a1 + vreplgr2vr.w VXZ, t3 +#endif + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE +#endif + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 +#ifdef DOUBLE + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 +#endif + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE +#ifdef DOUBLE + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE +#endif + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC +#ifdef DOUBLE + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC +#endif + vst VT0, X, 
4 * SIZE + vst VT1, Y, 4 * SIZE +#ifdef DOUBLE + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE +#endif + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 +#ifdef DOUBLE + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 +#endif + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE +#ifdef DOUBLE + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, Y, 2 * SIZE +#endif + vst VXZ, X, 4 * SIZE + vst VXZ, Y, 4 * SIZE +#ifdef DOUBLE + vst VXZ, X, 6 * SIZE + vst VXZ, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L120 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L122 // C!=0 S==0 + b .L121 // C!=0 S!=0 + .align 3 + +.L120: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L124 // C==0 S==0 + b .L123 // C==0 S!=0 + .align 3 + +.L121: // C!=0 S!=0 +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE +#endif + add.d Y, Y, INCY +#ifndef DOUBLE + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY +#endif + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 +#endif + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD 
VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vst VT0, X, 4 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // C!=0 S==0 +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 +#else + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 +#endif + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vst VT0, X, 0 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 4 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 4 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // C==0 S!=0 +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * 
SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 +#else + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 +#endif + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 0 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, X, 4 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, X, 6 * SIZE + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L124 + move Y, YY + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L210 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L212 // C!=0 S==0 + b .L211 // C!=0 S!=0 + .align 3 + +.L210: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L214 // C==0 S==0 + b .L213 // C==0 S!=0 + .align 3 + +.L211: // C!=0 S!=0 +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE +
add.d X, X, INCX + ld.d t2, X, 0 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 +#endif + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VXS, VX0 + VMSUB VT1, VX2, VXC, VT1 +#ifdef DOUBLE + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VXS, VX0 + VMSUB VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 6 * SIZE +#else + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // C!=0 S==0 +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 +#endif + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VMUL VT1, VX2, VXC + +#ifdef DOUBLE + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VMUL VT1, VX2, VXC + 
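+ // S == 0 specialization: the rotation collapses to plain scaling, x' = c*x
+ // and y' = c*y, so each batch needs only two VMULs and no VFMADD/VMSUB pair.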
vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + vst VT1, Y, 6 * SIZE +#else + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // C==0 S!=0 +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 +#endif + add.d X, X, INCX + VMUL VT0, VXS, VX2 + VMUL VT1, VXS, VX0 + VFSUB VT1, VXZ, VT1 + +#ifdef DOUBLE + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + VMUL VT0, VXS, VX2 + VMUL VT1, VXS, VX0 + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 6 * SIZE +#else + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X,
INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 2 * SIZE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + vst VXZ, Y, 6 * SIZE +#else + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vst VXZ, Y, 0 * SIZE + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 +#endif + add.d XX, XX, INCX + vst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + move X, XX + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 +
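+ // Each batch pairs one VFMADD (VT0 = c*x + s*y) with one VMSUB (VT1 = c*y - s*x),
+ // the vector counterpart of the scalar MADD/MSUB tail in .L998.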
vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + VMUL
VT0, VX0, VXC + VMUL VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + 
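+ // C == 0 specialization: x' = s*y and y' = -(s*x); the sign flip is formed
+ // by subtracting s*x from the all-zero vector VXZ.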
vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 +
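+ // The ladder .L221-.L224 specializes the rotation: general (c and s nonzero),
+ // s == 0 (pure scaling by c), c == 0 (the scaled swap just closed above), and
+ // both zero (.L224 below simply stores zeros to both vectors).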
+.L224: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE + MUL $f10, $f12, C + MADD $f10, $f13, S, $f10 + ST $f10, X, 0 * SIZE + addi.d I, I, -1 + MUL $f20, $f12, S + MSUB $f20, $f13, C, $f20 + ST $f20, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/scal_lasx.S b/kernel/loongarch64/scal_lasx.S new file mode 100644 index 000000000..48e2c0718 --- /dev/null +++ b/kernel/loongarch64/scal_lasx.S @@ -0,0 +1,282 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
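With both rot kernels complete, one shared structural point is worth noting: every vector loop in this patch consumes N >> 3 batches of eight elements and then falls through to a scalar tail (.L997/.L998 above) for the N & 7 leftovers. A small sketch of that split, with hypothetical helper names:

/* Batch/tail split shared by the BLAS-1 kernels in this patch. */
static void split8(long n, long *batches, long *tail)
{
    *batches = n >> 3;   /* srai.d I, N, 3: iterations of the vector loop */
    *tail    = n & 7;    /* andi  I, N, 7: elements left for the scalar tail */
}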
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VT0 $xr14 +#define VT1 $xr15 +#define VALPHA $xr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + movgr2fr.d a2, TEMP + FFINT a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + CMPEQ $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + MTG TEMP, ALPHA +#ifdef DOUBLE + xvreplgr2vr.d VALPHA, TEMP +#else + xvreplgr2vr.w VALPHA, TEMP +#endif + move XX, X + .align 3 + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 +.L11: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX1, VALPHA + xvstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VT0, VX0, VALPHA + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 +#endif + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + 
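The strided .L11 loop above gathers eight elements with xvinsgr2vr, scales them with a single vector multiply, and scatters the lanes back with xvstelm at stride INCX. Elementwise it reduces to the sketch below, with FLOAT again standing in for the build's element type; note how the shortcuts taken in the PROLOGUE and in the .L20 path show up:

typedef double FLOAT;  /* float in the single-precision build */

/* Scalar model of this SCAL kernel. The alpha == 0 path stores zeros
   without reading x, so NaNs in x are not propagated; this mirrors the
   kernel's .L20 branch rather than a literal IEEE multiply. */
static void scal_sketch(long n, FLOAT alpha, FLOAT *x, long incx)
{
    if (n <= 0 || incx <= 0) return;    /* bge $r0, N / INCX, .L999 */
    if (alpha == (FLOAT)1.0) return;    /* CMPEQ + bcnez shortcut   */
    for (long i = 0; i < n; i++)
        x[i * incx] = (alpha == (FLOAT)0.0) ? (FLOAT)0.0
                                            : alpha * x[i * incx];
}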
+.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L23: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 +.L25: + xvxor.v VX0, VX0, VX0 + xvst VX0, X, 0 * SIZE +#ifdef DOUBLE + xvst VX0, X, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L27: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + MTG TEMP, ALPHA +#ifdef DOUBLE + xvreplgr2vr.d VALPHA , TEMP +#else + xvreplgr2vr.w VALPHA , TEMP +#endif + .align 3 + +.L31: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvfmul.d VT0, VX0, VALPHA + xvfmul.d VT1, VX1, VALPHA + xvst VT0, X, 0 * SIZE + xvst VT1, X, 4 * SIZE +#else + xvfmul.s VT0, VX0, VALPHA + xvst VT0, X, 0 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L33: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + MUL a1, ALPHA, a1 + ST a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/scal_lsx.S b/kernel/loongarch64/scal_lsx.S new file mode 100644 index 000000000..1ffce7db2 --- /dev/null +++ b/kernel/loongarch64/scal_lsx.S @@ -0,0 +1,301 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VT0 $vr14 +#define VT1 $vr15 +#define VALPHA $vr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + movgr2fr.d a2, TEMP + FFINT a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + CMPEQ $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + MTG TEMP, ALPHA +#ifdef DOUBLE + vreplgr2vr.d VALPHA, TEMP +#else + vreplgr2vr.w VALPHA, TEMP +#endif + move XX, X + .align 3 + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 + +.L11: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + vfmul.s VT0, VX0, VALPHA + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vfmul.s VT1, VX1, VALPHA + vstelm.w VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 3 +#endif + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L23: + ST a1, X, 0 * 
SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 + +.L25: + vxor.v VX0, VX0, VX0 + vst VX0, X, 0 * SIZE +#ifdef DOUBLE + vst VX0, X, 2 * SIZE + vst VX0, X, 4 * SIZE + vst VX0, X, 6 * SIZE +#else + vst VX0, X, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L27: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + MTG TEMP, ALPHA +#ifdef DOUBLE + vreplgr2vr.d VALPHA , TEMP +#else + vreplgr2vr.w VALPHA , TEMP +#endif + .align 3 + +.L31: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vfmul.d VT1, VX1, VALPHA + vld VX0, X, 4 * SIZE + vst VT0, X, 0 * SIZE + vst VT1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vld VX1, X, 6 * SIZE + vst VT0, X, 4 * SIZE + vfmul.d VT1, VX1, VALPHA + vst VT1, X, 6 * SIZE + addi.d I, I, -1 +#else + vld VX1, X, 4 * SIZE + vfmul.s VT0, VX0, VALPHA + vfmul.s VT1, VX1, VALPHA + addi.d I, I, -1 + vst VT0, X, 0 * SIZE + vst VT1, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L33: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + MUL a1, ALPHA, a1 + ST a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/snrm2_lasx.S b/kernel/loongarch64/snrm2_lasx.S new file mode 100644 index 000000000..3ae11e897 --- /dev/null +++ b/kernel/loongarch64/snrm2_lasx.S @@ -0,0 +1,153 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 + +/* Don't change following FR unless you know the effects. */ +#define VX0 $xr15 +#define VX1 $xr16 +#define VX2 $xr17 +#define VX3 $xr18 +#define VX4 $xr21 +#define res1 $xr19 +#define res2 $xr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + xvld VX0, X, 0 + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + .align 3 + b .L996 + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + fadd.d $f19, $f19, $f16 + fadd.d $f19, $f19, $f17 + fadd.d $f19, $f19, $f18 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s $f15, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + fcvt.d.s $f15, $f15 + fmadd.d $f19, $f15, $f15, $f19 + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + move $r4, $r17 + fcvt.s.d $f0, $f19 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/snrm2_lsx.S b/kernel/loongarch64/snrm2_lsx.S new file mode 100644 index 000000000..bb492dbf0 --- /dev/null +++ b/kernel/loongarch64/snrm2_lsx.S @@ -0,0 +1,159 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
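snrm2_lsx.S, whose header begins here, mirrors the LASX kernel just shown, and both rest on one idea: each float input is widened to double (xvfcvtl.d.s for the low lanes, xvfcvth.d.s for the high ones), the squares are accumulated in double, and a single square root finishes the reduction. No scaling pass is needed because the square of any finite float fits comfortably in a double. The final .L996 blocks then fold the accumulator lanes into one value (xvpickve.d plus scalar fadd.d in the LASX file, one vreplvei.d/vfadd.d step in the LSX file). A scalar sketch of the strategy:

#include <math.h>

/* Widen-then-accumulate model of the snrm2 kernels: double accumulation
   makes overflow of squared float inputs impossible. */
static float snrm2_sketch(long n, const float *x, long incx)
{
    double acc = 0.0;
    for (long i = 0; i < n; i++) {
        double v = (double)x[i * incx];  /* fcvt.d.s / xvfcvt[lh].d.s */
        acc += v * v;                    /* fmadd.d / xvfmadd.d       */
    }
    return (float)sqrt(acc);             /* fsqrt.d then fcvt.s.d     */
}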
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define VX0 $vr15 +#define VX1 $vr16 +#define VX2 $vr17 +#define VX3 $vr18 +#define VX4 $vr21 +#define VX5 $vr22 +/* Don't change following FR unless you know the effects. */ +#define res1 $vr19 +#define res2 $vr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + vld VX0, X, 0 + vld VX5, X, 4 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfcvtl.d.s VX3, VX5 + vfcvth.d.s VX4, VX5 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + blt $r0, I, .L10 + b .L996 + .align 3 + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s $f15, X, 0 + addi.d I, I, -1 + fcvt.d.s $f15, $f15 + fmadd.d $f19, $f15, $f15, $f19 + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + move $r4, $r17 + fcvt.s.d $f0, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/sum_lasx.S b/kernel/loongarch64/sum_lasx.S new file mode 100644 index 000000000..fd6d5adb3 --- /dev/null +++ b/kernel/loongarch64/sum_lasx.S @@ -0,0 +1,225 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r17
+#define TEMP $r18
+#define t1 $r15
+#define t2 $r12
+#define t3 $r13
+#define t4 $r14
+#define VX0 $xr12
+#define VX1 $xr13
+#define VX2 $xr14
+#define VX3 $xr15
+#define res1 $xr16
+#define res2 $xr17
+    PROLOGUE
+    xvxor.v res1, res1, res1
+    xvxor.v res2, res2, res2
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, SIZE
+    slli.d INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L13
+    .align 3
+
+.L11:
+    xvld VX0, X, 0
+#ifdef DOUBLE
+    xvld VX1, X, 32
+    xvfadd.d res1, res1, VX0
+    xvfadd.d res1, res1, VX1
+#else
+    xvfadd.s res1, res1, VX0
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvfadd.s res2, res1, res2
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    LD $f12, X, 0
+    ADD $f16, $f12, $f16
+    addi.d I, I, -1
+    addi.d X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvfadd.s res1, VX0, res1
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvfadd.s res2, res1, res2
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD $f12, X, 0
+    ADD $f16, $f12, $f16
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/sum_lsx.S b/kernel/loongarch64/sum_lsx.S
new file mode 100644
index 000000000..6b2027781
--- /dev/null
+++ b/kernel/loongarch64/sum_lsx.S
@@ -0,0 +1,204 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + vld VX0, X, 0 + vld VX1, X, 16 + VFADD res2, VX0, VX1 + VFADD res1, res1, res2 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + VFADD res2, VX0, VX1 + VFADD res1, res1, res2 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, res1, VX1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/swap_lasx.S b/kernel/loongarch64/swap_lasx.S new file mode 100644 index 000000000..4767fffe3 --- /dev/null +++ b/kernel/loongarch64/swap_lasx.S @@ -0,0 +1,401 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 + xvld VX2, Y, 0 + addi.d I, I, -1 + xvst VX2, X, 0 + xvst VX0, Y, 0 +#ifdef DOUBLE + xvld VX0, X, 32 + xvld VX2, Y, 32 + xvst VX2, X, 32 + xvst VX0, Y, 32 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 + ld.d t1, Y, 0 + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.d t4, Y, 0 + xvstelm.d VX0, Y, 0, 3 + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvst VX2, X, 0 + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 + 
xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + ld.d t4, Y, 0 + xvstelm.d VX1, Y, 0, 3 + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX0, X, 0 + ld.w t1, Y, 0 + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + xvstelm.w VX0, Y, 0, 3 + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + ld.w t2, Y, 0 + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + ld.w t3, Y, 0 + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + ld.w t4, Y, 0 + xvstelm.w VX0, Y, 0, 7 + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvst VX2, X, 0 + addi.d X, X, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 + ld.d t1, X, 0 + xvstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + xvstelm.d VX2, X, 0, 1 + add.d X, X, INCX + ld.d t3, X, 0 + xvstelm.d VX2, X, 0, 2 + add.d X, X, INCX + ld.d t4, X, 0 + xvstelm.d VX2, X, 0, 3 + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvst VX0, Y, 0 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 + xvstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + xvstelm.d VX3, X, 0, 1 + add.d X, X, INCX + ld.d t3, X, 0 + xvstelm.d VX3, X, 0, 2 + add.d X, X, INCX + ld.d t4, X, 0 + xvstelm.d VX3, X, 0, 3 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX2, Y, 0 + ld.w t1, X, 0 + xvstelm.w VX2, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 + xvstelm.w VX2, X, 0, 1 + add.d X, X, INCX + ld.w t3, X, 0 + xvstelm.w VX2, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 + xvstelm.w VX2, X, 0, 3 + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 + xvstelm.w VX2, X, 0, 4 + add.d X, X, INCX + ld.w t2, X, 0 + xvstelm.w VX2, X, 0, 5 + add.d X, X, INCX + ld.w t3, X, 0 + xvstelm.w VX2, X, 0, 6 + add.d X, X, INCX + ld.w t4, X, 0 + xvstelm.w VX2, X, 0, 7 + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvst VX0, Y, 0 + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD b3, Y, 0 + ST a3, Y, 0 + add.d Y, Y, INCY + LD b4, Y, 0 + ST a4, Y, 0 + 
add.d Y, Y, INCY
+    LD a1, X, 0
+    add.d X, X, INCX
+    ST b1, XX, 0
+    add.d XX, XX, INCX
+    LD b1, Y, 0
+    ST a1, Y, 0
+    add.d Y, Y, INCY
+    LD a2, X, 0
+    add.d X, X, INCX
+    ST b2, XX, 0
+    add.d XX, XX, INCX
+    LD b2, Y, 0
+    ST a2, Y, 0
+    add.d Y, Y, INCY
+    LD a3, X, 0
+    add.d X, X, INCX
+    ST b3, XX, 0
+    add.d XX, XX, INCX
+    LD b3, Y, 0
+    ST a3, Y, 0
+    add.d Y, Y, INCY
+    LD a4, X, 0
+    add.d X, X, INCX
+    ST b4, XX, 0
+    add.d XX, XX, INCX
+    LD b4, Y, 0
+    ST a4, Y, 0
+    add.d Y, Y, INCY
+    ST b1, XX, 0
+    add.d XX, XX, INCX
+    ST b2, XX, 0
+    add.d XX, XX, INCX
+    ST b3, XX, 0
+    add.d XX, XX, INCX
+    ST b4, XX, 0
+    add.d XX, XX, INCX
+    addi.d I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    LD $f12, X, 0
+    LD $f14, Y, 0
+    addi.d I, I, -1
+    ST $f12, Y, 0
+    ST $f14, X, 0
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/swap_lsx.S b/kernel/loongarch64/swap_lsx.S
new file mode 100644
index 000000000..736187f93
--- /dev/null
+++ b/kernel/loongarch64/swap_lsx.S
@@ -0,0 +1,431 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and incy==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, Y, 0 + vld VX3, Y, 16 + addi.d I, I, -1 + vst VX2, X, 0 + vst VX3, X, 16 + vst VX0, Y, 0 + vst VX1, Y, 16 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + vld VX2, Y, 32 + vld VX3, Y, 48 + vst VX2, X, 32 + vst VX3, X, 48 + vst VX0, Y, 32 + vst VX1, Y, 48 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: +#ifdef DOUBLE + fld.d $f12, X, 0 + fld.d $f14, Y, 0 + addi.d I, I, -1 + fst.d $f12, Y, 0 + fst.d $f14, X, 0 +#else + fld.s $f12, X, 0 + fld.s $f14, Y, 0 + addi.d I, I, -1 + fst.s $f12, Y, 0 + fst.s $f14, X, 0 +#endif + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 + ld.d t1, Y, 0 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + vstelm.d VX0, Y, 0, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 0 + vld VX1, X, 2 * SIZE + ld.d t3, Y, 0 + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t4, Y, 0 + vstelm.d VX1, Y, 0, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + ld.d t1, Y, 0 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + vstelm.d VX0, Y, 0, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t4, Y, 0 + vstelm.d VX1, Y, 0, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 6 * SIZE + addi.d X, X, 8 * SIZE +#else + vld VX0, X, 0 + ld.w t1, Y, 0 + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + vstelm.w VX0, Y, 0, 3 + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vst VX2, X, 0 + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + vstelm.w VX1, Y, 0, 3 + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 
7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    LD $f12, X, 0
+    LD $f14, Y, 0
+    addi.d I, I, -1
+    ST $f12, Y, 0
+    ST $f14, X, 0
+    addi.d X, X, SIZE
+    add.d Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+/* INCX!=1 and INCY==1 */
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    vld VX2, Y, 0
+    ld.d t1, X, 0
+    vstelm.d VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    vstelm.d VX2, X, 0, 1
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vst VX0, Y, 0
+    vld VX3, Y, 2 * SIZE
+    ld.d t3, X, 0
+    vstelm.d VX3, X, 0, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    vstelm.d VX3, X, 0, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vst VX1, Y, 2 * SIZE
+    vld VX2, Y, 4 * SIZE
+    ld.d t1, X, 0
+    vstelm.d VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    vstelm.d VX2, X, 0, 1
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vst VX0, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    ld.d t3, X, 0
+    vstelm.d VX3, X, 0, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    vstelm.d VX3, X, 0, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vst VX1, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    vld VX2, Y, 0
+    ld.w t1, X, 0
+    vstelm.w VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    vstelm.w VX2, X, 0, 1
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    vstelm.w VX2, X, 0, 2
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    vstelm.w VX2, X, 0, 3
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vst VX0, Y, 0
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0
+    vstelm.w VX3, X, 0, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    vstelm.w VX3, X, 0, 1
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    vstelm.w VX3, X, 0, 2
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    vstelm.w VX3, X, 0, 3
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    vst VX1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    LD $f12, X, 0 * SIZE
+    LD $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    ST $f12, Y, 0 * SIZE
+    ST $f14, X, 0 * SIZE
+    add.d X, X, INCX
+    addi.d Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+/* INCX!=1 and INCY!=1 */
+.L22:
+    bge $r0, I, .L223
+    .align 3
+    move XX, X
+
+.L222:
+    LD a1, X, 0
+    add.d X, X, INCX
+    LD a2, X, 0
+    add.d X, X, INCX
+    LD a3, X, 0
+    add.d X, X, INCX
+    LD a4, X, 0
+    add.d X, X, INCX
+    LD b1, Y, 0
+    ST a1, Y, 0
+    add.d Y, Y, INCY
+    LD b2, Y, 0
+    ST a2, Y, 0
+    add.d Y, Y, INCY
+    LD b3, Y, 0
+    ST a3, Y, 0
+    add.d Y, Y, INCY
+    LD b4, Y, 0
+    ST a4, Y, 0
+    add.d Y, Y, INCY
+    LD a1, X, 0
+    add.d X, X, INCX
+    ST b1, XX, 0
+    add.d XX, XX, INCX
+    LD b1, Y, 0
+    ST a1, Y, 0
+    add.d Y, Y, INCY
+    LD a2, X, 0
+    add.d X, X, INCX
+    ST b2, XX, 0
+    add.d XX, XX, INCX
+    LD b2, Y, 0
+    ST a2, Y, 0
+    add.d Y, Y, INCY
+    LD a3, X, 0
+    add.d X, X, INCX
+    ST b3, XX, 0
+    add.d XX, XX, INCX
+    LD b3, Y, 0
+    ST a3, Y, 0
+    add.d Y, Y, INCY
+    LD a4, X, 0
+    add.d X, X, INCX
+    ST b4, XX, 0
+    add.d XX, XX, INCX
+    LD b4, Y, 0
+    ST a4, Y, 0
+    add.d Y, Y, INCY
+    ST b1, XX, 0
+    add.d XX, XX, INCX
+    ST b2, XX, 0
+    add.d XX, XX, INCX
+    ST b3, XX, 0
+    add.d XX, XX, INCX
+    ST b4, XX, 0
+    add.d XX, XX, INCX
+    addi.d I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    LD $f12, X, 0
+    LD $f14, Y, 0
+    addi.d I, I, -1
+    ST $f12, Y, 0
+    ST $f14, X, 0
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
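+
+/* For reference only: a minimal C sketch of the semantics every path above
+ * implements (not part of the build; the function name is hypothetical, and
+ * FLOAT stands for float or double depending on whether DOUBLE is defined):
+ *
+ *     static void swap_ref(BLASLONG n, FLOAT *x, BLASLONG incx,
+ *                          FLOAT *y, BLASLONG incy)
+ *     {
+ *         BLASLONG i;
+ *         for (i = 0; i < n; i++) {
+ *             FLOAT tmp = *x;   // exchange one element pair
+ *             *x = *y;
+ *             *y = tmp;
+ *             x += incx;        // step both vectors by their strides
+ *             y += incy;
+ *         }
+ *     }
+ *
+ * The .L222 loop buffers X through a1-a4/b1-b4 and writes back via XX, a
+ * lagged copy of X, so loads can run ahead of stores within each iteration.
+ */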
+ + EPILOGUE diff --git a/kernel/loongarch64/zgemm_kernel_2x2.S b/kernel/loongarch64/zgemm_kernel_2x2.S new file mode 100644 index 000000000..589d170c5 --- /dev/null +++ b/kernel/loongarch64/zgemm_kernel_2x2.S @@ -0,0 +1,848 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 +#define c51 $f26 +#define c52 $f27 +#define c61 $f28 +#define c62 $f29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -88 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + ST $f23, $sp, 32 + ST $f24, $sp, 40 + ST $f25, $sp, 48 + ST $f26, $sp, 56 + ST $f27, $sp, 64 + ST $f28, $sp, 72 + ST $f29, $sp, 80 + + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, BASE_SHIFT + + move J, $r0 + srai.d T0, N, 1 + beq J, T0, .L19 + +.L10: /* for(j=0; j 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + + xvpermi.q D0, U2, 0x02 + xvpermi.q D2, U0, 0x31 + xvpermi.q D1, U3, 0x02 + xvpermi.q D3, U1, 0x31 + + xvst D0, TD, 0x00 + xvst D2, TD, 0x20 + xvst D1, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x40 // a_offset1 + addi.d S2, S2, 0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + xvld U0, TS, 0x00 + xvld U1, TS, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d TS, 
TS, 0x40 // a_offset + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + vld $vr0, TS, 0x00 + vst $vr0, TD, 0x00 + + + addi.d TS, TS, 0x10 // a_offset + addi.d TD, TD, 0x10 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_2_lasx.S b/kernel/loongarch64/zgemm_tcopy_2_lasx.S new file mode 100644 index 000000000..3fe17beef --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_2_lasx.S @@ -0,0 +1,212 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, S8, 0x00 + xvst U2, S8, 0x20 + + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + xvst U1, S8, 0x00 + xvst U3, S8, 0x20 + + add.d S8, S8, T0 + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S9, 0x00 + vst $vr1, S9, 0x10 + + addi.d S9, S9, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + xvld U0, TS, 0x00 + xvld U1, TS, 0x20 + + xvst U0, TD, 0x00 + + slli.d T0, M, 0x05 + add.d TD, TD, T0 + + xvst U1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + xvld U0, TS, 0x00 + + xvst U0, TD, 0x00 + + addi.d TS, TS, 0x20 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld $vr0, TS, 0x00 + + vst $vr0, S9, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/znrm2_lasx.S b/kernel/loongarch64/znrm2_lasx.S new file mode 100644 index 000000000..53f8a6e05 --- /dev/null +++ b/kernel/loongarch64/znrm2_lasx.S @@ -0,0 +1,252 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 +#define a3 $f15 +#define a2 $f16 +#define VX0 $xr15 +#define VX1 $xr16 +#define VM0 $xr17 +#define VM1 $xr18 +#define VM2 $xr13 +#define VM3 $xr14 +#define res1 $xr19 +#define res2 $xr20 +#define VALPHA $xr21 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L97 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmaxa.d VM1, VX1, VX0 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + xvpickve.d VX0, VM0, 1 + xvpickve.d VX1, VM0, 2 + xvpickve.d VM3, VM0, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM2, VM3, VM0 + xvfmaxa.d VM0, VM1, VM2 + .align 3 + +.L97: + andi I, N, 3 + bge $r0, I, .L99 + .align 3 + +.L98: + fld.d a3, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fmaxa.d a3, a2, a3 + fmaxa.d max, a3, max + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + 
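+    // First-pass reduction is done: max holds max_i(|Re(x_i)|, |Im(x_i)|),
+    // accumulated with fmaxa (maximum by magnitude). Compute ALPHA = 1/max,
+    // bailing out early: if max == 0 the norm is 0, and if 1/max rounds to
+    // +inf (max is subnormal) the scaled pass is skipped. The second pass
+    // (.L100 onward) accumulates sum((x*ALPHA)^2) so that .L999 can return
+    // max * sqrt(sum) without overflow in the intermediate squares.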
fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L120 + bge $r0, I, .L997 + .align 3 + +.L110: + xvld VX0, XX, 0 * SIZE + xvld VX1, XX, 4 * SIZE + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + bge $r0, I, .L997 + .align 3 + +.L121: + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.d VX0, res1, 1 + xvpickve.d VX1, res1, 2 + xvpickve.d VM2, res1, 3 + xvfadd.d res1, VX0, res1 + xvfadd.d VX1, VX1, VM2 + xvfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d a3, XX, 0 * SIZE + fld.d a2, XX, 1 * SIZE + addi.d I, I, -1 + fmul.d a3, a3, ALPHA + fmadd.d $f19, a3, a3, $f19 + fmul.d a2, a2, ALPHA + fmadd.d $f19, a2, a2, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/znrm2_lsx.S b/kernel/loongarch64/znrm2_lsx.S new file mode 100644 index 000000000..14c59d504 --- /dev/null +++ b/kernel/loongarch64/znrm2_lsx.S @@ -0,0 +1,260 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 +#define a3 $f15 +#define a2 $f16 +#define VX0 $vr15 +#define VX1 $vr16 +#define VM0 $vr17 +#define VM1 $vr18 +#define VM2 $vr13 +#define VM3 $vr14 +#define res1 $vr19 +#define res2 $vr20 +#define VALPHA $vr21 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L97 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmaxa.d VM1, VX1, VX0 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmaxa.d VM2, VX1, VX0 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + vfmaxa.d VM0, VX0, VX1 + .align 3 + +.L97: + andi I, N, 3 + bge $r0, I, .L99 + .align 3 + +.L98: + fld.d a3, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fmaxa.d a3, a2, a3 + fmaxa.d max, a3, max + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L120 + bge $r0, I, .L997 + .align 3 + +.L110: + vld VX0, XX, 0 * SIZE + vld VX1, XX, 2 * SIZE + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + vld VX0, XX, 4 * SIZE + vld VX1, XX, 6 * SIZE + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 
+ vfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + bge $r0, I, .L997 + .align 3 + +.L121: + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + ld.d t1, XX, 0 * SIZE + vfmul.d VM3, VX1, VALPHA + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d a3, XX, 0 * SIZE + fld.d a2, XX, 1 * SIZE + addi.d I, I, -1 + fmul.d a3, a3, ALPHA + fmadd.d $f19, a3, a3, $f19 + fmul.d a2, a2, ALPHA + fmadd.d $f19, a2, a2, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970 index a99fb7d96..fee5fa529 100644 --- a/kernel/power/KERNEL.PPC970 +++ b/kernel/power/KERNEL.PPC970 @@ -1,11 +1,11 @@ ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) SGEMMKERNEL = gemm_kernel.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) else diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4 index 1bdd3119e..c73601cee 100644 --- a/kernel/power/KERNEL.PPCG4 +++ b/kernel/power/KERNEL.PPCG4 @@ -96,9 +96,9 @@ CGEMMINCOPY = CGEMMONCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = +CGEMMINCOPYOBJ = #cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = +CGEMMITCOPYOBJ = #cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/power/sgemm_tcopy_16_power8.S b/kernel/power/sgemm_tcopy_16_power8.S index b9f6d63fb..6d2c6a555 100644 --- a/kernel/power/sgemm_tcopy_16_power8.S +++ b/kernel/power/sgemm_tcopy_16_power8.S @@ -108,6 +108,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define o0 0 +#ifdef POWER10 +#include "sgemm_tcopy_macros_16_power10.S" +#endif #include "sgemm_tcopy_macros_16_power8.S" #define STACKSIZE 144 diff --git a/kernel/power/sgemm_tcopy_macros_16_power10.S b/kernel/power/sgemm_tcopy_macros_16_power10.S new file mode 100644 index 000000000..dca37e48a --- /dev/null +++ b/kernel/power/sgemm_tcopy_macros_16_power10.S @@ -0,0 +1,323 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=16
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_4x16', `
+#else
+.macro COPY_4x16
+#endif
+
+    lxvpx    vs32, o0, A0
+    lxvpx    vs34, o32, A0
+
+    lxvpx    vs36, o0, A1
+    lxvpx    vs38, o32, A1
+
+    lxvpx    vs40, o0, A2
+    lxvpx    vs42, o32, A2
+
+    lxvpx    vs44, o0, A3
+    lxvpx    vs46, o32, A3
+
+    mr       T1, BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs32, o0, T1
+    stxvx    vs33, o16, T1
+    stxvx    vs34, o32, T1
+    stxvx    vs35, o48, T1
+#else
+    stxvx    vs33, o0, T1
+    stxvx    vs32, o16, T1
+    stxvx    vs35, o32, T1
+    stxvx    vs34, o48, T1
+#endif
+
+    addi     T1, T1, 64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs36, o0, T1
+    stxvx    vs37, o16, T1
+    stxvx    vs38, o32, T1
+    stxvx    vs39, o48, T1
+#else
+    stxvx    vs37, o0, T1
+    stxvx    vs36, o16, T1
+    stxvx    vs39, o32, T1
+    stxvx    vs38, o48, T1
+#endif
+
+    addi     T1, T1, 64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs40, o0, T1
+    stxvx    vs41, o16, T1
+    stxvx    vs42, o32, T1
+    stxvx    vs43, o48, T1
+#else
+    stxvx    vs41, o0, T1
+    stxvx    vs40, o16, T1
+    stxvx    vs43, o32, T1
+    stxvx    vs42, o48, T1
+#endif
+
+    addi     T1, T1, 64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs44, o0, T1
+    stxvx    vs45, o16, T1
+    stxvx    vs46, o32, T1
+    stxvx    vs47, o48, T1
+#else
+    stxvx    vs45, o0, T1
+    stxvx    vs44, o16, T1
+    stxvx    vs47, o32, T1
+    stxvx    vs46, o48, T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_4x8', `
+#else
+.macro COPY_4x8
+#endif
+
+    lxvpx    vs32, o0, A0
+
+    lxvpx    vs34, o0, A1
+
+    lxvpx    vs36, o0, A2
+
+    lxvpx    vs38, o0, A3
+
+    mr       T1, BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs32, o0, T1
+    stxvx    vs33, o16, T1
+
+    stxvx    vs34, o32, T1
+    stxvx    vs35, o48, T1
+#else
+    stxvx    vs33, o0, T1
+    stxvx    vs32, o16, T1
+
+    stxvx    vs35, o32, T1
+    stxvx    vs34, o48, T1
+#endif
+
+    addi     T1, T1, 64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs36, o0, T1
+    stxvx    vs37, o16, T1
+
+    stxvx    vs38, o32, T1
+    stxvx    vs39, o48, T1
+#else
+    stxvx    vs37, o0, T1
+    stxvx    vs36, o16, T1
+
+    stxvx    vs39, o32, T1
+    stxvx    vs38, o48, T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=2 and M=16
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_2x16', `
+#else
+.macro COPY_2x16
+#endif
+
+    lxvpx    vs32, o0, A0
+    lxvpx    vs34, o32, A0
+
+    lxvpx    vs36, o0, A1
+    lxvpx    vs38, o32, A1
+
+    mr       T1, BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs32, o0, T1
+    stxvx    vs33, o16, T1
+    stxvx    vs34, o32, T1
+    stxvx    vs35, o48, T1
+#else
+    stxvx    vs33, o0, T1
+    stxvx    vs32, o16, T1
+    stxvx    vs35, o32, T1
+    stxvx    vs34, o48, T1
+#endif
+
+    addi     T1, T1, 64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs36, o0, T1
+    stxvx    vs37, o16, T1
+    stxvx    vs38, o32, T1
+    stxvx    vs39, o48, T1
+#else
+    stxvx    vs37, o0, T1
+    stxvx    vs36, o16, T1
+    stxvx    vs39, o32, T1
+    stxvx    vs38, o48, T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_2x8', `
+#else
+.macro COPY_2x8
+#endif
+
+    lxvpx    vs32, o0, A0
+
+    lxvpx    vs34, o0, A1
+
+    mr       T1, BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs32, o0, T1
+    stxvx    vs33, o16, T1
+
+    stxvx    vs34, o32, T1
+    stxvx    vs35, o48, T1
+#else
+    stxvx    vs33, o0, T1
+    stxvx    vs32, o16, T1
+
+    stxvx    vs35, o32, T1
+    stxvx    vs34, o48, T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=1 and M=16
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_1x16', `
+#else
+.macro COPY_1x16
+#endif
+
+    lxvpx    vs32, o0, A0
+    lxvpx    vs34, o32, A0
+
+    mr       T1, BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs32, o0, T1
+    stxvx    vs33, o16, T1
+    stxvx    vs34, o32, T1
+    stxvx    vs35, o48, T1
+#else
+    stxvx    vs33, o0, T1
+    stxvx    vs32, o16, T1
+    stxvx    vs35, o32, T1
+    stxvx    vs34, o48, T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_1x8', `
+#else
+.macro COPY_1x8
+#endif
+
+    lxvpx    vs32, o0, A0
+
+    mr       T1, BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx    vs32, o0, T1
+    stxvx    vs33, o16, T1
+#else
+    stxvx    vs33, o0, T1
+    stxvx    vs32, o16, T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
diff --git a/kernel/power/sgemm_tcopy_macros_16_power8.S b/kernel/power/sgemm_tcopy_macros_16_power8.S
index ed592a604..af237d5ee 100644
--- a/kernel/power/sgemm_tcopy_macros_16_power8.S
+++ b/kernel/power/sgemm_tcopy_macros_16_power8.S
@@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=4 and M=16
 **********************************************************************************************/
 
+#ifndef POWER10
 #if defined(_AIX)
 define(`COPY_4x16', `
 #else
@@ -141,6 +142,7 @@ define(`COPY_4x8', `
 #else
 .endm
 #endif
+#endif
 
 /**********************************************************************************************
 * Macros for N=4 and M=4
@@ -264,6 +266,7 @@ define(`COPY_4x1', `
 * Macros for N=2 and M=16
 **********************************************************************************************/
 
+#ifndef POWER10
 #if defined(_AIX)
 define(`COPY_2x16', `
 #else
@@ -329,6 +332,7 @@ define(`COPY_2x8', `
 #else
 .endm
 #endif
+#endif
 
 /**********************************************************************************************
 * Macros for N=2 and M=4
@@ -418,6 +422,7 @@ define(`COPY_2x1', `
 * Macros for N=1 and M=16
 **********************************************************************************************/
 
+#ifndef POWER10
 #if defined(_AIX)
 define(`COPY_1x16', `
 #else
@@ -465,6 +470,7 @@ define(`COPY_1x8', `
 #else
 .endm
 #endif
+#endif
 
 /**********************************************************************************************
 * Macros for N=1 and M=4
diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL
index bea7036c2..f8278c3b4 100644
--- a/kernel/x86_64/KERNEL
+++ b/kernel/x86_64/KERNEL
@@ -405,7 +405,7 @@ DGEMVNKERNEL = dgemv_n.S
 endif
 
 ifndef DGEMVTKERNEL
-DGEMVTKERNEL = dgemv_t.S
+DGEMVTKERNEL = dgemv_t_4.c
 endif
 
 ifndef CGEMVNKERNEL
diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c
index e4d054311..28d78d279 100644
--- a/kernel/x86_64/casum.c
+++ b/kernel/x86_64/casum.c
@@ -9,12 +9,12 @@
 #endif
 
 #ifndef HAVE_CASUM_KERNEL
-static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
+static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
 {
 
     BLASLONG i=0;
     BLASLONG n_8 = n & -8;
-    FLOAT *x = x1;
+    FLOAT *x1 = x;
     FLOAT temp0, temp1, temp2, temp3;
     FLOAT temp4, temp5, temp6, temp7;
     FLOAT sum0 = 0.0;
@@ -24,14 +24,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
     FLOAT sum4 = 0.0;
 
     while (i < n_8) {
-        temp0 = ABS_K(x[0]);
-        temp1 = ABS_K(x[1]);
-        temp2 = ABS_K(x[2]);
-        temp3 = ABS_K(x[3]);
-        temp4 = ABS_K(x[4]);
-        temp5 = ABS_K(x[5]);
-        temp6 = ABS_K(x[6]);
-        temp7 = ABS_K(x[7]);
+        temp0 = ABS_K(x1[0]);
+        temp1 = ABS_K(x1[1]);
+        temp2 = ABS_K(x1[2]);
+        temp3 = ABS_K(x1[3]);
+        temp4 = ABS_K(x1[4]);
+        temp5 = ABS_K(x1[5]);
+        temp6 = ABS_K(x1[6]);
+        temp7 = ABS_K(x1[7]);
 
         sum0 += temp0;
         sum1 += temp1;
@@ -43,12 +43,12 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
         sum2 += temp6;
         sum3 += temp7;
 
-        x+=8;
+        x1+=8;
         i+=4;
     }
 
     while (i < n) {
-        sum4 += (ABS_K(x1[0]) + ABS_K(x1[1]));
+        sum4 += ABS_K(x1[0]) + ABS_K(x1[1]);
         x1 += 2;
         i++;
     }
diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c
index d261962de..10b70ff20 100644
--- a/kernel/x86_64/casum_microk_skylakex-2.c
+++ b/kernel/x86_64/casum_microk_skylakex-2.c
@@ -2,9 +2,9 @@
 #ifdef __NVCOMPILER
 #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
 #endif
-#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && (__clang_major__ >= 9 &&__clang_major__ !=17)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
+#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
 
-#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309))
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
 
 #define HAVE_CASUM_KERNEL 1
 
@@ -20,15 +20,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
 
     if (n2 < 64) {
         __m128 accum_10, accum_11, accum_12, accum_13;
-        __m128 abs_mask1 = abs_mask1;
+        __m128 abs_mask1;
 
         accum_10 = _mm_setzero_ps();
         accum_11 = _mm_setzero_ps();
         accum_12 = _mm_setzero_ps();
         accum_13 = _mm_setzero_ps();
 
-        abs_mask1 = (__m128)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
-        abs_mask1 = (__m128)_mm_srli_epi32((__m128i) abs_mask1, 1);
+        abs_mask1 = (__m128)_mm_set1_epi32(0x7fffffff);
 
         _mm_prefetch(&x1[0], _MM_HINT_T0);
diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c
index dddf03fe2..f6bc8e37b 100644
--- a/kernel/x86_64/zasum_microk_skylakex-2.c
+++ b/kernel/x86_64/zasum_microk_skylakex-2.c
@@ -2,9 +2,9 @@
 #ifdef __NVCOMPILER
 #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
 #endif
-#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && ( __clang_major__ >= 9 && __clang_major__ != 17)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
+#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
 
-#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309))
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
 
 #define HAVE_ZASUM_KERNEL 1
 
@@ -21,16 +21,14 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
 
     if (n2 < 32) {
         __m128d accum_10, accum_11, accum_12, accum_13;
-        __m128d abs_mask1 = abs_mask1;
+        __m128d abs_mask1;
 
         accum_10 = _mm_setzero_pd();
         accum_11 = _mm_setzero_pd();
         accum_12 = _mm_setzero_pd();
         accum_13 = _mm_setzero_pd();
 
-        // abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);
-        abs_mask1 = (__m128d)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
-        abs_mask1 = (__m128d)_mm_srli_epi64((__m128i) abs_mask1, 1);
+        abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);
 
         _mm_prefetch(&x1[0], _MM_HINT_T0);
 
         if (n2 >= 16){
diff --git a/lapack-netlib/CMakeLists.txt b/lapack-netlib/CMakeLists.txt
index fefaa8b89..f1f47ae24 100644
--- a/lapack-netlib/CMakeLists.txt
+++ b/lapack-netlib/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.2)
+cmake_minimum_required(VERSION 3.6)
 
-project(LAPACK Fortran C)
+project(LAPACK)
 
 set(LAPACK_MAJOR_VERSION 3)
-set(LAPACK_MINOR_VERSION 11)
+set(LAPACK_MINOR_VERSION 12)
 set(LAPACK_PATCH_VERSION 0)
 
 set(
   LAPACK_VERSION
@@ -45,6 +45,14 @@ if(_is_coverage_build)
   find_package(codecov)
 endif()
 
+# Use valgrind if it is found
+option( LAPACK_TESTING_USE_PYTHON "Use Python for testing. Disable it on memory checks." ON )
+find_program( MEMORYCHECK_COMMAND valgrind )
+if( MEMORYCHECK_COMMAND )
+  message( STATUS "Found valgrind: ${MEMORYCHECK_COMMAND}" )
+  set( MEMORYCHECK_COMMAND_OPTIONS "--leak-check=full --show-leak-kinds=all --track-origins=yes" )
+endif()
+
 # By default test Fortran compiler complex abs and complex division
 option(TEST_FORTRAN_COMPILER "Test Fortran compiler complex abs and complex division" OFF)
 if( TEST_FORTRAN_COMPILER )
@@ -76,7 +84,7 @@ if( TEST_FORTRAN_COMPILER )
     WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL
     COMMENT "Running test_zminMax in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zminMax.err"
     SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zminMax.f )
-
+
 endif()
 
 # By default static library
@@ -99,6 +107,8 @@ else()
   set(LAPACKELIB "lapacke")
   set(TMGLIB "tmglib")
 endif()
+# By default build standard API and extended _64 API
+option(BUILD_INDEX64_EXT_API "Build Index-64 API as extended API with _64 suffix" ON)
 
 include(GNUInstallDirs)
 
@@ -127,90 +137,6 @@ configure_file(
 include(PreventInSourceBuilds)
 include(PreventInBuildInstalls)
 
-# Check if recursive flag exists
-include(CheckFortranCompilerFlag)
-if(CMAKE_Fortran_COMPILER_ID STREQUAL Flang)
-  check_fortran_compiler_flag("-Mrecursive" _MrecursiveFlag)
-elseif(CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
-  check_fortran_compiler_flag("-frecursive" _frecursiveFlag)
-elseif(CMAKE_Fortran_COMPILER_ID STREQUAL Intel)
-  check_fortran_compiler_flag("-recursive" _recursiveFlag)
-elseif(CMAKE_Fortran_COMPILER_ID STREQUAL XL)
-  check_fortran_compiler_flag("-qrecur" _qrecurFlag)
-elseif(CMAKE_Fortran_COMPILER_ID STREQUAL NAG)
-  check_fortran_compiler_flag("-recursive" _recursiveFlag)
-else()
-  message(WARNING "Fortran local arrays should be allocated on the stack."
-    " Please use a compiler which guarantees that feature."
-    " See https://github.com/Reference-LAPACK/lapack/pull/188 and references therein.")
-endif()
-
-# Add recursive flag
-if(_MrecursiveFlag)
-  string(REGEX MATCH "-Mrecursive" output_test "${CMAKE_Fortran_FLAGS}")
-  if(NOT output_test)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mrecursive"
-      CACHE STRING "Recursive flag must be set" FORCE)
-  endif()
-elseif(_frecursiveFlag)
-  string(REGEX MATCH "-frecursive" output_test "${CMAKE_Fortran_FLAGS}")
-  if(NOT output_test)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -frecursive"
-      CACHE STRING "Recursive flag must be set" FORCE)
-  endif()
-elseif(_recursiveFlag)
-  string(REGEX MATCH "-recursive" output_test "${CMAKE_Fortran_FLAGS}")
-  if(NOT output_test)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -recursive"
-      CACHE STRING "Recursive flag must be set" FORCE)
-  endif()
-elseif(_qrecurFlag)
-  string(REGEX MATCH "-qrecur" output_test "${CMAKE_Fortran_FLAGS}")
-  if(NOT output_test)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qrecur"
-      CACHE STRING "Recursive flag must be set" FORCE)
-  endif()
-endif()
-
-if(UNIX)
-  if(CMAKE_Fortran_COMPILER_ID STREQUAL Intel)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict")
-  endif()
-  if(CMAKE_Fortran_COMPILER_ID STREQUAL XL)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict")
-  endif()
-# Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler.
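A note on the ?ASUM microkernel hunks above: the old code built the sign-clearing constant by comparing `abs_mask1` against itself while it was still uninitialized (`__m128 abs_mask1 = abs_mask1;`). `cmpeq(x, x)` does yield all-ones bits for any non-trap value, but reading an indeterminate value is undefined behaviour in C, and an optimizer is entitled to discard the whole computation. The following standalone sketch -- an illustration only, not OpenBLAS source -- shows that the replacement `_mm_set1_epi32(0x7fffffff)` produces exactly the same mask and that ANDing it with a float vector implements fabsf():

    /* Illustration of the mask change in the casum/zasum hunks above.
     * Build with any SSE2-capable compiler, e.g. "cc -msse2 mask_demo.c". */
    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        /* Old idiom, but seeded from a well-defined value: an all-ones
         * register with each 32-bit lane shifted right by one, leaving
         * 0x7fffffff per lane.  The patched code seeded the cmpeq from an
         * uninitialized variable instead, which is the UB being fixed. */
        __m128i ones     = _mm_set1_epi32(-1);
        __m128i old_mask = _mm_srli_epi32(ones, 1);   /* 0x7fffffff lanes */

        /* New idiom from the patch: state the constant directly. */
        __m128i new_mask = _mm_set1_epi32(0x7fffffff);

        /* ANDing a float with 0x7fffffff clears its sign bit, i.e. fabsf(). */
        __m128 x  = _mm_set_ps(-4.0f, 3.0f, -2.0f, 1.0f);
        __m128 ax = _mm_and_ps(x, _mm_castsi128_ps(new_mask));

        float out[4];
        _mm_storeu_ps(out, ax);
        printf("masks identical: %d\n",
               _mm_movemask_epi8(_mm_cmpeq_epi32(old_mask, new_mask)) == 0xffff);
        printf("|x| = %g %g %g %g\n", out[0], out[1], out[2], out[3]);
        return 0;
    }

Both forms end up as a single register constant; the new one simply removes the read of an indeterminate value that a sufficiently aggressive compiler is allowed to exploit.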
-# This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin
-  string(REPLACE \;mtsk\; \; CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES "${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES}")
-endif()
-
-if(CMAKE_Fortran_COMPILER_ID STREQUAL Compaq)
-  if(WIN32)
-    if(CMAKE_GENERATOR STREQUAL "NMake Makefiles")
-      get_filename_component(CMAKE_Fortran_COMPILER_CMDNAM ${CMAKE_Fortran_COMPILER} NAME_WE)
-      message(STATUS "Using Compaq Fortran compiler with command name ${CMAKE_Fortran_COMPILER_CMDNAM}")
-      set(cmd ${CMAKE_Fortran_COMPILER_CMDNAM})
-      string(TOLOWER "${cmd}" cmdlc)
-      if(cmdlc STREQUAL "df")
-        message(STATUS "Assume the Compaq Visual Fortran Compiler is being used")
-        set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
-        set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_INCLUDES 1)
-        #This is a workaround that is needed to avoid forward-slashes in the
-        #filenames listed in response files from incorrectly being interpreted as
-        #introducing compiler command options
-        if(${BUILD_SHARED_LIBS})
-          message(FATAL_ERROR "Making of shared libraries with CVF has not been tested.")
-        endif()
-        set(str "NMake version 9 or later should be used. NMake version 6.0 which is\n")
-        set(str "${str} included with the CVF distribution fails to build Lapack because\n")
-        set(str "${str} the number of source files exceeds the limit for NMake v6.0\n")
-        message(STATUS ${str})
-        set(CMAKE_Fortran_LINK_EXECUTABLE "LINK /out: ")
-      endif()
-    endif()
-  endif()
-endif()
-
 # Add option to enable flat namespace for symbol resolution on macOS
 if(APPLE)
   option(USE_FLAT_NAMESPACE "Use flat namespaces for symbol resolution during build and runtime." OFF)
@@ -268,26 +194,6 @@
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/bin)
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/lib)
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/lib)
 
-# --------------------------------------------------
-# Check for any necessary platform specific compiler flags
-include(CheckLAPACKCompilerFlags)
-CheckLAPACKCompilerFlags()
-
-# --------------------------------------------------
-# Check second function
-
-include(CheckTimeFunction)
-set(TIME_FUNC NONE)
-CHECK_TIME_FUNCTION(NONE TIME_FUNC)
-CHECK_TIME_FUNCTION(INT_CPU_TIME TIME_FUNC)
-CHECK_TIME_FUNCTION(EXT_ETIME TIME_FUNC)
-CHECK_TIME_FUNCTION(EXT_ETIME_ TIME_FUNC)
-CHECK_TIME_FUNCTION(INT_ETIME TIME_FUNC)
-message(STATUS "--> Will use second_${TIME_FUNC}.f and dsecnd_${TIME_FUNC}.f as timing function.")
-
-set(SECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/second_${TIME_FUNC}.f)
-set(DSECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/dsecnd_${TIME_FUNC}.f)
-
 # deprecated LAPACK and LAPACKE routines
 option(BUILD_DEPRECATED "Build deprecated routines" OFF)
 message(STATUS "Build deprecated routines: ${BUILD_DEPRECATED}")
@@ -380,18 +286,27 @@ endif()
 
 # Check the usage of the user provided or automatically found LAPACK libraries
 if(LAPACK_LIBRARIES)
-  include(CheckFortranFunctionExists)
-  set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES})
-  # Check if new routine of 3.4.0 is in LAPACK_LIBRARIES
-  CHECK_FORTRAN_FUNCTION_EXISTS("dgeqrt" LATESTLAPACK_FOUND)
-  unset(CMAKE_REQUIRED_LIBRARIES)
-  if(LATESTLAPACK_FOUND)
-    message(STATUS "--> LAPACK supplied by user is WORKING, will use ${LAPACK_LIBRARIES}.")
+  include(CheckLanguage)
+  check_language(Fortran)
+  if(CMAKE_Fortran_COMPILER)
+    enable_language(Fortran)
+    include(CheckFortranFunctionExists)
+    set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES})
+    # Check if new routine of 3.4.0 is in LAPACK_LIBRARIES
+    CHECK_FORTRAN_FUNCTION_EXISTS("dgeqrt" LATESTLAPACK_FOUND)
+    unset(CMAKE_REQUIRED_LIBRARIES)
+    if(LATESTLAPACK_FOUND)
+      message(STATUS "--> LAPACK supplied by user is WORKING, will use ${LAPACK_LIBRARIES}.")
+    else()
+      message(ERROR "--> LAPACK supplied by user is not WORKING or is older than LAPACK 3.4.0, CANNOT USE ${LAPACK_LIBRARIES}.")
+      message(ERROR "--> Will use REFERENCE LAPACK (by default)")
+      message(ERROR "--> Or Correct your LAPACK_LIBRARIES entry ")
+      message(ERROR "--> Or Consider checking USE_OPTIMIZED_LAPACK")
+    endif()
   else()
-    message(ERROR "--> LAPACK supplied by user is not WORKING or is older than LAPACK 3.4.0, CANNOT USE ${LAPACK_LIBRARIES}.")
-    message(ERROR "--> Will use REFERENCE LAPACK (by default)")
-    message(ERROR "--> Or Correct your LAPACK_LIBRARIES entry ")
-    message(ERROR "--> Or Consider checking USE_OPTIMIZED_LAPACK")
+    message(STATUS "--> LAPACK supplied by user is ${LAPACK_LIBRARIES}.")
+    message(STATUS "--> CMake couldn't find a Fortran compiler, so it cannot check if the provided LAPACK library works.")
+    set(LATESTLAPACK_FOUND TRUE)
   endif()
 endif()
 
@@ -399,6 +314,27 @@
 if(NOT LATESTLAPACK_FOUND)
   message(STATUS "Using supplied NETLIB LAPACK implementation")
   set(LAPACK_LIBRARIES ${LAPACKLIB})
+
+  enable_language(Fortran)
+
+  # Check for any necessary platform specific compiler flags
+  include(CheckLAPACKCompilerFlags)
+  CheckLAPACKCompilerFlags()
+
+  # Check second function
+  include(CheckTimeFunction)
+  set(TIME_FUNC NONE)
+  CHECK_TIME_FUNCTION(NONE TIME_FUNC)
+  CHECK_TIME_FUNCTION(INT_CPU_TIME TIME_FUNC)
+  CHECK_TIME_FUNCTION(EXT_ETIME TIME_FUNC)
+  CHECK_TIME_FUNCTION(EXT_ETIME_ TIME_FUNC)
+  CHECK_TIME_FUNCTION(INT_ETIME TIME_FUNC)
+
+  # Set second function
+  message(STATUS "--> Will use second_${TIME_FUNC}.f and dsecnd_${TIME_FUNC}.f as timing function.")
+  set(SECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/second_${TIME_FUNC}.f)
+  set(DSECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/dsecnd_${TIME_FUNC}.f)
+
+  add_subdirectory(SRC)
 else()
   set(CMAKE_EXE_LINKER_FLAGS
@@ -431,9 +367,11 @@ endif()
 # Cache export target
 set(LAPACK_INSTALL_EXPORT_NAME_CACHE ${LAPACK_INSTALL_EXPORT_NAME})
 if(BUILD_TESTING OR LAPACKE_WITH_TMG)
+  enable_language(Fortran)
   if(LATESTLAPACK_FOUND AND LAPACKE_WITH_TMG)
     set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES})
     # Check if dlatms (part of tmg) is found
+    include(CheckFortranFunctionExists)
     CHECK_FORTRAN_FUNCTION_EXISTS("dlatms" LAPACK_WITH_TMGLIB_FOUND)
     unset(CMAKE_REQUIRED_LIBRARIES)
     if(NOT LAPACK_WITH_TMGLIB_FOUND)
@@ -448,6 +386,12 @@ endif()
 set(LAPACK_INSTALL_EXPORT_NAME ${LAPACK_INSTALL_EXPORT_NAME_CACHE})
 unset(LAPACK_INSTALL_EXPORT_NAME_CACHE)
 
+
+#-------------------------------------
+# LAPACKE
+# Include lapack.h and lapacke_mangling.h even if LAPACKE is not built
+add_subdirectory(LAPACKE/include)
+
 if(LAPACKE)
   add_subdirectory(LAPACKE)
 endif()
@@ -474,8 +418,8 @@ if (BLAS++)
   ExternalProject_Add(blaspp
     URL https://bitbucket.org/icl/blaspp/downloads/blaspp-2020.10.02.tar.gz
    CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/blaspp-prefix/src/blaspp
-    BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make
-    INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install
+    BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build .
+    INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install .
   )
   ExternalProject_Add_StepDependencies(blaspp build ${BLAS_LIBRARIES})
 endif()
@@ -487,16 +431,16 @@ if (LAPACK++)
   ExternalProject_Add(lapackpp
     URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz
    CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES=${LAPACK_LIBRARIES} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp
-    BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make
-    INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install
+    BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build .
+    INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install .
   )
 else ()
   # FIXME this does not really work as the libraries list gets converted to a semicolon-separated list somewhere in the lapack++ build files
   ExternalProject_Add(lapackpp
     URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz
    CONFIGURE_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES="${PROJECT_BINARY_DIR}/lib/liblapack.a -lgfortran" -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp
-    BUILD_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make
-    INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install
+    BUILD_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build .
+    INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install .
   )
 endif()
 ExternalProject_Add_StepDependencies(lapackpp build blaspp ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES})
@@ -671,22 +615,34 @@ if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION)
     set(DOXYGEN_PROJECT_BRIEF "LAPACK: Linear Algebra PACKage")
     set(DOXYGEN_PROJECT_NUMBER ${LAPACK_VERSION})
     set(DOXYGEN_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/DOCS)
-    set(PROJECT_LOGO ${CMAKE_CURRENT_SOURCE_DIR}/DOCS/lapack.png)
+    set(DOXYGEN_PROJECT_LOGO ${CMAKE_CURRENT_SOURCE_DIR}/DOCS/lapack.png)
     set(DOXYGEN_OPTIMIZE_FOR_FORTRAN YES)
     set(DOXYGEN_SOURCE_BROWSER YES)
-    set(DISTRIBUTE_GROUP_DOC YES)
     set(DOXYGEN_CREATE_SUBDIRS YES)
     set(DOXYGEN_SEPARATE_MEMBER_PAGES YES)
     set(DOXYGEN_EXTRACT_ALL YES)
-    set(DOXYGEN_FILE_PATTERNS "*.f;*.c;*.h")
+    set(DOXYGEN_FILE_PATTERNS *.f *.f90 *.c *.h )
     set(DOXYGEN_RECURSIVE YES)
     set(DOXYGEN_GENERATE_TREEVIEW YES)
+    set(DOXYGEN_DOT_IMAGE_FORMAT svg)
     set(DOXYGEN_INTERACTIVE_SVG YES)
-    set(DOXYGEN_QUIET YES)
-    set(DOXYGEN_WARNINGS NO)
+    set(DOXYGEN_QUIET NO)
+    set(DOXYGEN_WARNINGS YES)
+    set(DOXYGEN_WARN_NO_PARAMDOC YES)
+    set(DOXYGEN_WARN_LOGFILE doxygen_error)
     set(DOXYGEN_GENERATE_HTML NO)
     set(DOXYGEN_GENERATE_MAN NO)
-
+    set(DOXYGEN_LAYOUT_FILE "DOCS/DoxygenLayout.xml")
+
+    # Exclude functions that are duplicated, creating conflicts.
+    set(DOXYGEN_EXCLUDE .git
+                        .github
+                        SRC/VARIANTS
+                        BLAS/SRC/lsame.f
+                        BLAS/SRC/xerbla.f
+                        BLAS/SRC/xerbla_array.f
+                        INSTALL/slamchf77.f
+                        INSTALL/dlamchf77.f )
 
   if (BUILD_HTML_DOCUMENTATION)
     set(DOXYGEN_GENERATE_HTML YES)
@@ -697,13 +653,21 @@ if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION)
 
     doxygen_add_docs(
       html
-      ${PROJECT_SOURCE_DIR}
+
+      # Doxygen INPUT =
+      ${PROJECT_SOURCE_DIR}/README.md
+      ${PROJECT_SOURCE_DIR}/BLAS
+      ${PROJECT_SOURCE_DIR}/CBLAS
+      ${PROJECT_SOURCE_DIR}/SRC
+      ${PROJECT_SOURCE_DIR}/INSTALL
+      ${PROJECT_SOURCE_DIR}/TESTING
+      ${PROJECT_SOURCE_DIR}/DOCS/groups-usr.dox
+
       COMMENT "Generating html LAPACK documentation (it will take some time... time to grab a coffee)"
     )
   endif()
 
   if (BUILD_MAN_DOCUMENTATION)
     set(DOXYGEN_GENERATE_MAN YES)
-    set(DOXYGEN_EXCLUDE SRC/VARIANTS)
     set(DOXYGEN_MAN_LINKS YES)
     set(DOXYGEN_INLINE_SOURCES NO)
     set(DOXYGEN_CALL_GRAPH NO)
@@ -711,7 +675,15 @@ if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION)
 
     doxygen_add_docs(
       man
-      ${PROJECT_SOURCE_DIR}
+
+      # Doxygen INPUT =
+      ${PROJECT_SOURCE_DIR}/BLAS
+      ${PROJECT_SOURCE_DIR}/CBLAS
+      ${PROJECT_SOURCE_DIR}/SRC
+      ${PROJECT_SOURCE_DIR}/INSTALL
+      ${PROJECT_SOURCE_DIR}/TESTING
+      ${PROJECT_SOURCE_DIR}/DOCS/groups-usr.dox
+
       COMMENT "Generating man LAPACK documentation"
     )
   endif()
diff --git a/lapack-netlib/DOCS/Doxyfile b/lapack-netlib/DOCS/Doxyfile
index 43cea43b5..577675772 100644
--- a/lapack-netlib/DOCS/Doxyfile
+++ b/lapack-netlib/DOCS/Doxyfile
@@ -1,4 +1,4 @@
-# Doxyfile 1.8.10
+# Doxyfile 1.9.1
 
 # This file describes the settings to be used by the documentation system
 # doxygen (www.doxygen.org) for a project.
@@ -17,11 +17,11 @@
 # Project related configuration options
 #---------------------------------------------------------------------------
 
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
-# for the list of possible encodings.
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
 # The default value is: UTF-8.
 
 DOXYFILE_ENCODING = UTF-8
@@ -38,7 +38,7 @@ PROJECT_NAME = LAPACK
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER = 3.9.0
+PROJECT_NUMBER = 3.12.0
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
@@ -93,6 +93,14 @@ ALLOW_UNICODE_NAMES = NO
 
 OUTPUT_LANGUAGE = English
 
+# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all generated output in the proper direction.
+# Possible values are: None, LTR, RTL and Context.
+# The default value is: None.
+
+OUTPUT_TEXT_DIRECTION = None
+
 # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
 # descriptions after the members that are listed in the file and class
 # documentation (similar to Javadoc). Set to NO to disable this.
@@ -179,6 +187,16 @@ SHORT_NAMES = NO
 
 JAVADOC_AUTOBRIEF = NO
 
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER = NO
+
 # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
 # line (until the first dot) of a Qt-style comment as the brief description. If
 # set to NO, the Qt-style will behave just like regular Qt-style comments (thus
@@ -199,6 +217,14 @@ QT_AUTOBRIEF = NO
 
 MULTILINE_CPP_IS_BRIEF = NO
 
+# By default Python docstrings are displayed as preformatted text and doxygen's
+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the
+# doxygen's special commands can be used and the contents of the docstring
+# documentation blocks is shown as doxygen documentation.
+# The default value is: YES.
+
+PYTHON_DOCSTRING = YES
+
 # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
 # documentation from any documented member that it re-implements.
 # The default value is: YES.
@@ -226,16 +252,15 @@ TAB_SIZE = 8
 # will allow you to put the command \sideeffect (or @sideeffect) in the
 # documentation, which will result in a user-defined paragraph with heading
 # "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines.
+# newlines (in the resulting output). You can put ^^ in the value part of an
+# alias to insert a newline as if a physical newline was in the original file.
+# When you need a literal { or } or , in the value part of an alias you have to
+# escape them by means of a backslash (\), this can lead to conflicts with the
+# commands \{ and \} for these it is advised to use the version @{ and @} or use
+# a double escape (\\{ and \\})
 
 ALIASES =
 
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST =
-
 # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
 # only. Doxygen will then generate output that is more tailored for C. For
 # instance, some of the names that are used will be different. The list of all
@@ -264,28 +289,40 @@ OPTIMIZE_FOR_FORTRAN = YES
 
 OPTIMIZE_OUTPUT_VHDL = NO
 
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE = NO
+
 # Doxygen selects the parser to use depending on the extension of the files it
 # parses. With this tag you can assign which parser to use for a given
 # extension. Doxygen has a built-in mapping, but you can override or extend it
 # using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
-# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
-# Fortran. In the later case the parser tries to guess whether the code is fixed
-# or free formatted code, this is the default for Fortran type files), VHDL. For
-# instance to make doxygen treat .inc files as Fortran files (default is PHP),
-# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
+# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL,
+# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files). For instance to make doxygen treat .inc files
+# as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C.
 #
 # Note: For files without extension you can use no_extension as a placeholder.
 #
 # Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
+# the files are not read by doxygen. When specifying no_extension you should add
+# * to the FILE_PATTERNS.
+#
+# Note see also the list of default file extension mappings.
 
 EXTENSION_MAPPING =
 
 # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
 # according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
 # The output of markdown processing is further processed by doxygen, so you can
 # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
 # case of backward compatibilities issues.
@@ -293,6 +330,15 @@ EXTENSION_MAPPING =
 
 MARKDOWN_SUPPORT = YES
 
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 5.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS = 5
+
 # When enabled doxygen tries to link words that correspond to documented
 # classes, or namespaces to their corresponding documentation. Such a link can
 # be prevented in individual cases by putting a % sign in front of the word or
@@ -318,7 +364,7 @@ BUILTIN_STL_SUPPORT = NO
 CPP_CLI_SUPPORT = NO
 
 # Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
 # will parse them like normal C++ but will assume all classes use public instead
 # of private inheritance when no explicit protection keyword is present.
 # The default value is: NO.
@@ -341,7 +387,7 @@ IDL_PROPERTY_SUPPORT = YES
 # all members of a group must be documented explicitly.
 # The default value is: NO.
 
-DISTRIBUTE_GROUP_DOC = YES
+DISTRIBUTE_GROUP_DOC = NO
 
 # If one adds a struct or class to a group and this option is enabled, then also
 # any nested class or struct is added to the same group. By default this option
@@ -404,6 +450,19 @@ TYPEDEF_HIDES_STRUCT = NO
 
 LOOKUP_CACHE_SIZE = 0
 
+# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use
+# during processing. When set to 0 doxygen will based this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which efficively disables parallel processing. Please report any issues you
+# encounter. Generating dot graphs in parallel is controlled by the
+# DOT_NUM_THREADS setting.
+# Minimum value: 0, maximum value: 32, default value: 1.
+
+NUM_PROC_THREADS = 1
+
 #---------------------------------------------------------------------------
 # Build related configuration options
 #---------------------------------------------------------------------------
@@ -424,6 +483,12 @@ EXTRACT_ALL = YES
 
 EXTRACT_PRIVATE = NO
 
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL = NO
+
 # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
 # scope will be included in the documentation.
 # The default value is: NO.
@@ -461,6 +526,13 @@ EXTRACT_LOCAL_METHODS = NO
 
 EXTRACT_ANON_NSPACES = NO
 
+# If this flag is set to YES, the name of an unnamed parameter in a declaration
+# will be determined by the corresponding definition. By default unnamed
+# parameters remain unnamed in the output.
+# The default value is: YES.
+
+RESOLVE_UNNAMED_PARAMS = YES
+
 # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
 # undocumented members inside documented classes or files. If set to NO these
 # members will be included in the various overviews, but no documentation
@@ -478,8 +550,8 @@ HIDE_UNDOC_MEMBERS = NO
 HIDE_UNDOC_CLASSES = NO
 
 # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO, these declarations will be
-# included in the documentation.
+# declarations. If set to NO, these declarations will be included in the
+# documentation.
 # The default value is: NO.
 
 HIDE_FRIEND_COMPOUNDS = NO
@@ -498,11 +570,18 @@ HIDE_IN_BODY_DOCS = NO
 
 INTERNAL_DOCS = NO
 
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES, upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
+# With the correct setting of option CASE_SENSE_NAMES doxygen will better be
+# able to match the capabilities of the underlying filesystem. In case the
+# filesystem is case sensitive (i.e. it supports files in the same directory
+# whose names only differ in casing), the option must be set to YES to properly
+# deal with such files in case they appear in the input. For filesystems that
+# are not case sensitive the option should be be set to NO to properly deal with
+# output files written for symbols that only differ in casing, such as for two
+# classes, one named CLASS and the other named Class, and to also support
+# references to files without having to specify the exact matching casing. On
+# Windows (including Cygwin) and MacOS, users should typically set this option
+# to NO, whereas on Linux or other Unix flavors it should typically be set to
+# YES.
 # The default value is: system dependent.
 
 CASE_SENSE_NAMES = NO
@@ -684,12 +763,12 @@ FILE_VERSION_FILTER =
 # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
 # tag is left empty.
 
-LAYOUT_FILE =
+LAYOUT_FILE = DOCS/DoxygenLayout.xml
 
 # The CITE_BIB_FILES tag can be used to specify one or more bib files containing
 # the reference definitions. This must be a list of .bib files. The .bib
 # extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
 # For LaTeX the style of the bibliography can be controlled using
 # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
 # search path. See also \cite for info how to create references.
@@ -705,7 +784,7 @@ CITE_BIB_FILES =
 # messages are off.
 # The default value is: NO.
 
-QUIET = YES
+QUIET = NO
 
 # The WARNINGS tag can be used to turn on/off the warning messages that are
 # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
@@ -734,10 +813,20 @@ WARN_IF_DOC_ERROR = YES
 # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
 # are documented, but have no documentation for their parameters or return
 # value. If set to NO, doxygen will only warn about wrong or incomplete
-# parameter documentation, but not about the absence of documentation.
+# parameter documentation, but not about the absence of documentation. If
+# EXTRACT_ALL is set to YES then this flag will automatically be disabled.
 # The default value is: NO.
 
-WARN_NO_PARAMDOC = NO
+WARN_NO_PARAMDOC = YES
+
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
+# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
+# at the end of the doxygen process doxygen will return with a non-zero status.
+# Possible values are: NO, YES and FAIL_ON_WARNINGS.
+# The default value is: NO.
+
+WARN_AS_ERROR = NO
 
 # The WARN_FORMAT tag determines the format of the warning messages that doxygen
 # can produce. The string should contain the $file, $line, and $text tags, which
@@ -753,7 +842,7 @@ WARN_FORMAT = "$file:$line: $text"
 # messages should be written. If left blank the output is written to standard
 # error (stderr).
 
-WARN_LOGFILE = output_err
+WARN_LOGFILE = doxygen_error
 
 #---------------------------------------------------------------------------
 # Configuration options related to the input files
 #---------------------------------------------------------------------------
@@ -762,17 +851,18 @@
 # The INPUT tag is used to specify the files and/or directories that contain
 # documented source files. You may enter file names like myfile.cpp or
 # directories like /usr/src/myproject. Separate the files or directories with
-# spaces.
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.
 
-INPUT = . \
-        DOCS/groups-usr.dox
+INPUT = BLAS CBLAS SRC INSTALL TESTING \
+        DOCS/groups-usr.dox \
+        README.md
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
 # libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
-# possible encodings.
+# documentation (see:
+# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
 # The default value is: UTF-8.
 
 INPUT_ENCODING = UTF-8
@@ -785,14 +875,19 @@ INPUT_ENCODING = UTF-8
 # need to set EXTENSION_MAPPING for the extension otherwise the files are not
 # read by doxygen.
 #
+# Note the list of default checked file patterns might differ from the list of
+# default file extension mappings.
+#
 # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
 # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
 # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd,
-# *.vhdl, *.ucf, *.qsf, *.as and *.js.
+# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment),
+# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl,
+# *.ucf, *.qsf and *.ice.
 
 FILE_PATTERNS = *.c \
                 *.f \
+                *.f90 \
                 *.h
 
 # The RECURSIVE tag can be used to specify whether or not subdirectories should
@@ -808,34 +903,15 @@ RECURSIVE = YES
 # Note that relative paths are relative to the directory from which doxygen is
 # run.
 
-EXCLUDE = CMAKE \
-          DOCS \
-          .svn \
-          CBLAS/.svn \
-          CBLAS/src/.svn \
-          CBLAS/testing/.svn \
-          CBLAS/example/.svn \
-          CBLAS/include/.svn \
-          BLAS/.svn \
-          BLAS/SRC/.svn \
-          BLAS/TESTING/.svn \
-          SRC/.svn \
-          SRC/VARIANTS/.svn \
-          SRC/VARIANTS/LIB/.svn \
-          SRC/VARIANTS/cholesky/.svn \
-          SRC/VARIANTS/cholesky/RL/.svn \
-          SRC/VARIANTS/cholesky/TOP/.svn \
-          SRC/VARIANTS/lu/.svn \
-          SRC/VARIANTS/lu/CR/.svn \
-          SRC/VARIANTS/lu/LL/.svn \
-          SRC/VARIANTS/lu/REC/.svn \
-          SRC/VARIANTS/qr/.svn \
-          SRC/VARIANTS/qr/LL/.svn \
-          INSTALL/.svn \
-          TESTING/.svn \
-          TESTING/EIG/.svn \
-          TESTING/MATGEN/.svn \
-          TESTING/LIN/.svn
+# Exclude functions that are duplicated, creating conflicts.
+EXCLUDE = .git \
+          .github \
+          SRC/VARIANTS \
+          BLAS/SRC/lsame.f \
+          BLAS/SRC/xerbla.f \
+          BLAS/SRC/xerbla_array.f \
+          INSTALL/slamchf77.f \
+          INSTALL/dlamchf77.f \
 
 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
@@ -908,6 +984,10 @@ IMAGE_PATH =
 # Note that the filter must not add or remove lines; it is applied before the
 # code is scanned, but not when the output code is generated. If lines are added
 # or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
 
 INPUT_FILTER =
 
@@ -917,6 +997,10 @@ INPUT_FILTER =
 # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
 # filters are used. If the FILTER_PATTERNS tag is empty or if none of the
 # patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
 
 FILTER_PATTERNS =
 
@@ -969,7 +1053,7 @@ INLINE_SOURCES = YES
 STRIP_CODE_COMMENTS = YES
 
 # If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
+# entity all documented functions referencing it will be listed.
 # The default value is: NO.
 
 REFERENCED_BY_RELATION = NO
@@ -1001,12 +1085,12 @@ SOURCE_TOOLTIPS = YES
 # If the USE_HTAGS tag is set to YES then the references to source code will
 # point to the HTML generated by the htags(1) tool instead of doxygen built-in
 # source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
+# (see https://www.gnu.org/software/global/global.html). You will need version
 # 4.8.6 or higher.
 #
 # To use it do the following:
 # - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
 # - Make sure the INPUT points to the root of the source tree
 # - Run doxygen as normal
 #
@@ -1028,25 +1112,6 @@ USE_HTAGS = NO
 
 VERBATIM_HEADERS = YES
 
-# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
-# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
-# cost of reduced performance. This can be particularly helpful with template
-# rich C++ code for which doxygen's built-in parser lacks the necessary type
-# information.
-# Note: The availability of this option depends on whether or not doxygen was
-# compiled with the --with-libclang option.
-# The default value is: NO.
-
-CLANG_ASSISTED_PARSING = NO
-
-# If clang assisted parsing is enabled you can provide the compiler with command
-# line options that you would normally use when invoking the compiler. Note that
-# the include paths will already be set by doxygen for the files and directories
-# specified with INPUT and INCLUDE_PATH.
-# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
-
-CLANG_OPTIONS =
-
 #---------------------------------------------------------------------------
 # Configuration options related to the alphabetical class index
 #---------------------------------------------------------------------------
@@ -1058,13 +1123,6 @@ CLANG_OPTIONS =
 
 ALPHABETICAL_INDEX = YES
 
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX = 5
-
 # In case all classes in a project start with a common prefix, all classes will
 # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
 # can be used to specify a prefix (or a list of prefixes) that should be ignored
@@ -1165,7 +1223,7 @@ HTML_EXTRA_FILES =
 # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
 # will adjust the colors in the style sheet and background images according to
 # this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
 # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
 # purple, and 360 is red again.
 # Minimum value: 0, maximum value: 359, default value: 220.
@@ -1201,6 +1259,17 @@ HTML_COLORSTYLE_GAMMA = 80
 
 HTML_TIMESTAMP = YES
 
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via JavaScript. If disabled, the navigation index will
+# consists of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have JavaScript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS = YES
+
 # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
 # documentation will contain sections that can be hidden and shown after the
 # page has loaded.
@@ -1224,13 +1293,14 @@ HTML_INDEX_NUM_ENTRIES = 100
 
 # If the GENERATE_DOCSET tag is set to YES, additional index files will be
 # generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
 # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
+# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
+# genXcode/_index.html for more information.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTML is set to YES.
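Regarding the warning settings switched on earlier in this Doxyfile diff (WARN_NO_PARAMDOC = YES, with output collected in the doxygen_error log): doxygen now complains about documented functions whose parameters lack \param tags. A hypothetical C routine -- not taken from the LAPACK sources -- that satisfies the stricter check looks like this:

    /**
     * @brief Scales a vector by a constant: x := alpha * x.
     *
     * @param[in]     n      Number of elements in @p x.
     * @param[in]     alpha  Scale factor applied to every element.
     * @param[in,out] x      Array of length @p n; overwritten with alpha * x.
     */
    void example_scal(int n, float alpha, float *x)
    {
        for (int i = 0; i < n; ++i)
            x[i] *= alpha;
    }

Note that the same Doxyfile keeps EXTRACT_ALL = YES, which -- per the option's own comment above -- automatically disables this particular warning, so the setting mainly takes effect in configurations that override EXTRACT_ALL.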
@@ -1269,8 +1339,8 @@ DOCSET_PUBLISHER_NAME = Publisher
 # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
 # additional HTML index files: index.hhp, index.hhc, and index.hhk. The
 # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
+# (see:
+# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows.
 #
 # The HTML Help Workshop contains a compiler that can convert all HTML output
 # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
@@ -1300,7 +1370,7 @@ CHM_FILE =
 
 HHC_LOCATION =
 
 # The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
+# (YES) or that it should be included in the main .chm file (NO).
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
@@ -1345,7 +1415,8 @@ QCH_FILE =
 
 # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
 # Project output. For more information please see Qt Help Project / Namespace
-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
 # The default value is: org.doxygen.Project.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_NAMESPACE = org.doxygen.Project
 
 # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
 # Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
-# folders).
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
 # The default value is: doc.
 # This tag requires that the tag GENERATE_QHP is set to YES.
@@ -1362,30 +1433,30 @@ QHP_VIRTUAL_FOLDER = doc
 
 # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
 # filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_NAME =
 
 # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
 # custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_ATTRS =
 
 # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
 # project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_SECT_FILTER_ATTRS =
 
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
+# The QHG_LOCATION tag can be used to specify the location (absolute path
+# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
+# run qhelpgenerator on the generated .qhp file.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHG_LOCATION =
@@ -1462,6 +1533,17 @@ TREEVIEW_WIDTH = 250
 
 EXT_LINKS_IN_WINDOW = NO
 
+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
+# the HTML output. These images will generally look nicer at scaled resolutions.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
+# The default value is: png.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FORMULA_FORMAT = png
+
 # Use this tag to change the font size of LaTeX formulas included as images in
 # the HTML documentation. When you change the font size after a successful
 # doxygen run you need to manually remove any form_*.png images from the HTML
@@ -1471,7 +1553,7 @@ EXT_LINKS_IN_WINDOW = NO
 
 FORMULA_FONTSIZE = 10
 
-# Use the FORMULA_TRANPARENT tag to determine whether or not the images
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
 # generated for formulas are transparent PNGs. Transparent PNGs are not
 # supported properly for IE 6.0, but are supported on all modern browsers.
 #
@@ -1482,8 +1564,14 @@ FORMULA_FONTSIZE = 10
 
 FORMULA_TRANSPARENT = YES
 
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE =
+
 # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
 # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
 # installed or if you want to formulas look prettier in the HTML output. When
 # enabled you may also need to install MathJax separately and configure the path
@@ -1495,7 +1583,7 @@ USE_MATHJAX = NO
 
 # When MathJax is enabled you can set the default output format to be used for
 # the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details.
 # Possible values are: HTML-CSS (which is slower, but has the best
 # compatibility), NativeMML (i.e. MathML) and SVG.
 # The default value is: HTML-CSS.
@@ -1510,8 +1598,8 @@
 # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
 # Content Delivery Network so you can quickly see the result without installing
 # MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# MathJax from https://www.mathjax.org before deployment.
+# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
 MATHJAX_RELPATH = http://www.mathjax.org/mathjax
@@ -1525,7 +1613,8 @@ MATHJAX_EXTENSIONS =
 
 # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
 # of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
 # example see the documentation.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
 MATHJAX_CODEFILE =
@@ -1553,7 +1642,7 @@ MATHJAX_CODEFILE =
 SEARCHENGINE = YES
 
 # When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
+# implemented using a web server instead of a web client using JavaScript. There
 # are two flavors of web server based searching depending on the EXTERNAL_SEARCH
 # setting. When disabled, doxygen will generate a PHP script for searching and
 # an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
@@ -1572,7 +1661,8 @@ SERVER_BASED_SEARCH = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/).
+# Xapian (see:
+# https://xapian.org/).
 #
 # See the section "External Indexing and Searching" for details.
 # The default value is: NO.
@@ -1585,8 +1675,9 @@ EXTERNAL_SEARCH = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/). See the section "External Indexing and
-# Searching" for details.
+# Xapian (see:
+# https://xapian.org/). See the section "External Indexing and Searching" for
+# details.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
 SEARCHENGINE_URL =
@@ -1637,21 +1728,35 @@ LATEX_OUTPUT = latex
 # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
 # invoked.
 #
-# Note that when enabling USE_PDFLATEX this option is only used for generating
-# bitmaps for formulas in the HTML output, but not in the Makefile that is
-# written to the output directory.
-# The default file is: latex.
+# Note that when not enabling USE_PDFLATEX the default is latex when enabling
+# USE_PDFLATEX the default is pdflatex and when in the later case latex is
+# chosen this is overwritten by pdflatex. For specific output languages the
+# default can have been set differently, this depends on the implementation of
+# the output language.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_CMD_NAME = latex
 
 # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
 # index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
 # The default file is: makeindex.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 MAKEINDEX_CMD_NAME = makeindex
 
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD = makeindex
+
 # If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
 # documents. This may be useful for small projects and may help to save some
 # trees in general.
@@ -1736,9 +1841,11 @@ LATEX_EXTRA_FILES =
 
 PDF_HYPERLINKS = YES
 
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES, to get a
-# higher quality PDF documentation.
+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
+# files. Set this option to YES, to get a higher quality PDF documentation.
+#
+# See also section LATEX_CMD_NAME for selecting the engine.
 # The default value is: YES.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
@@ -1772,12 +1879,28 @@ LATEX_SOURCE_CODE = NO
 
 # The LATEX_BIB_STYLE tag can be used to specify the style to use for the
 # bibliography, e.g. plainnat, or ieeetr. See
-# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
 # The default value is: plain.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_BIB_STYLE = plain
 
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP = NO
+
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY =
+
 #---------------------------------------------------------------------------
 # Configuration options related to the RTF output
 #---------------------------------------------------------------------------
@@ -1817,9 +1940,9 @@ COMPACT_RTF = NO
 
 RTF_HYPERLINKS = YES
 
-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
-# file, i.e. a series of assignments. You only have to provide replacements,
-# missing definitions are set to their default value.
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# configuration file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
 #
 # See also section "Doxygen usage" for information on how to generate the
 # default style sheet that doxygen normally uses.
@@ -1828,8 +1951,8 @@ RTF_HYPERLINKS = YES
 
 RTF_STYLESHEET_FILE =
 
 # Set optional variables used in the generation of an RTF document. Syntax is
-# similar to doxygen's config file. A template extensions file can be generated
-# using doxygen -e rtf extensionFile.
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
 # This tag requires that the tag GENERATE_RTF is set to YES.
 
 RTF_EXTENSIONS_FILE =
@@ -1915,6 +2038,13 @@ XML_OUTPUT = xml
 
 XML_PROGRAMLISTING = YES
 
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_NS_MEMB_FILE_SCOPE = NO
+
 #---------------------------------------------------------------------------
 # Configuration options related to the DOCBOOK output
 #---------------------------------------------------------------------------
@@ -1947,9 +2077,9 @@ DOCBOOK_PROGRAMLISTING = NO
 #---------------------------------------------------------------------------
 
 # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
-# AutoGen Definitions (see http://autogen.sf.net) file that captures the
-# structure of the code including all documentation. Note that this feature is
-# still experimental and incomplete at the moment.
+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
 # The default value is: NO.
 
 GENERATE_AUTOGEN_DEF = NO
@@ -2116,12 +2246,6 @@ EXTERNAL_GROUPS = YES
 
 EXTERNAL_PAGES = YES
 
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of 'which perl').
-# The default file (with absolute path) is: /usr/bin/perl.
-
-PERL_PATH = /sw/bin/perl
-
 #---------------------------------------------------------------------------
 # Configuration options related to the dot tool
 #---------------------------------------------------------------------------
@@ -2135,15 +2259,6 @@ PERL_PATH = /sw/bin/perl
 
 CLASS_DIAGRAMS = YES
 
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH =
-
 # You can include diagrams made with dia in doxygen documentation. Doxygen will
 # then run dia to produce the diagram and insert it in the documentation. The
 # DIA_PATH tag allows you to specify the directory where the dia binary resides.
@@ -2241,10 +2356,32 @@ UML_LOOK = NO
 # but if the number exceeds 15, the total amount of fields shown is limited to
 # 10.
 # Minimum value: 0, maximum value: 100, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
+# This tag requires that the tag UML_LOOK is set to YES.
 
 UML_LIMIT_NUM_FIELDS = 10
 
+# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
+# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
+# tag is set to YES, doxygen will add type and arguments for attributes and
+# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
+# will not generate fields with class member information in the UML graphs. The
+# class diagrams will look similar to the default class diagrams but using UML
+# notation for the relationships.
+# Possible values are: NO, YES and NONE.
+# The default value is: NO.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+DOT_UML_DETAILS = NO
+
+# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
+# to display on a single line. If the actual line length exceeds this threshold
+# significantly it will be wrapped across multiple lines. Some heuristics are
+# applied to avoid ugly line breaks.
+# Minimum value: 0, maximum value: 1000, default value: 17.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_WRAP_THRESHOLD = 17
+
 # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
 # collaboration graphs will show the relations between templates and their
 # instances.
@@ -2371,6 +2508,11 @@ DIAFILE_DIRS =
 
 PLANTUML_JAR_PATH =
 
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE =
+
 # When using plantuml, the specified paths are searched for files specified by
 # the !include statement in a plantuml block.
@@ -2429,9 +2571,11 @@ DOT_MULTI_TARGETS = NO
 
 GENERATE_LEGEND = YES
 
-# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
 # files that are used to generate the various graphs.
+#
+# Note: This setting is not only used for dot files but also for msc and
+# plantuml temporary files.
 # The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_CLEANUP = YES
diff --git a/lapack-netlib/DOCS/DoxygenLayout.xml b/lapack-netlib/DOCS/DoxygenLayout.xml
new file mode 100644
index 000000000..aeb346d8d
--- /dev/null
+++ b/lapack-netlib/DOCS/DoxygenLayout.xml
@@ -0,0 +1,197 @@
+[197 added lines of Doxygen layout XML; the element markup did not survive extraction and is omitted here]
diff --git a/lapack-netlib/DOCS/groups-usr.dox b/lapack-netlib/DOCS/groups-usr.dox
index 6c31e4cf5..cbd747165 100644
--- a/lapack-netlib/DOCS/groups-usr.dox
+++ b/lapack-netlib/DOCS/groups-usr.dox
@@ -1,605 +1,974 @@
 /**
- *
- ****
- *
- * @defgroup lapack LAPACK
- * This is the group of LAPACK routines.
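The groups-usr.dox rewrite that starts here replaces the flat scheme visible in the removed lines, where every group names its parent explicitly via @ingroup, with a nested hierarchy in which parentage follows from @{ ... @} blocks and leaf groups are named after routine bases. A minimal sketch of the two styles, using only group names that appear in this diff:

    /* old scheme: parent declared per group */
    /**
     * @defgroup solveGE Linear Solve
     * @ingroup GE
     * This is the group of Linear Solve Driver routines
     */

    /* new scheme: parent implied by nesting */
    /**
    @defgroup lapack_top LAPACK
    @{
      @defgroup solve_top Linear solve, AX = B
      @{
        @defgroup gesv gesv: factor and solve
      @}
    @}
    */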
- * - * @defgroup GE General Matrices - * @ingroup lapack - * This is the group of General Matrices routines - * @defgroup GB General Band Matrix - * @ingroup lapack - * This is the group of General Band routines - * @defgroup SY Symmetric Matrix - * @ingroup lapack - * This is the group of Symmetric Matrix routines - * @defgroup HE Hermitian Matrix - * @ingroup lapack - * This is the group of Hermitian Matrix routines - * @defgroup PO Positive Definite Matrix - * @ingroup lapack - * This is the group of Positive Definite routines - * @defgroup GT General tridiagonal Matrix - * @ingroup lapack - * This is the group of General tridiagonal routines - * @defgroup PT Positive Definite tridiagonal Matrix - * @ingroup lapack - * This is the group of Positive Definite tridiagonal routines - * @defgroup OTHEReigen Eigenvalue - * @ingroup lapack - * This is the group of Other Eigenvalue routines - * @defgroup OTHERauxiliary Other Auxiliary Routines - * @ingroup lapack - * This is the group of Other Auxiliary routines - * @defgroup OTHERcomputational Other Computational Routines - * @ingroup lapack - * This is the group of Other Computational routines - * @defgroup OTHERsolve Other Solve Routines - * @ingroup lapack - * This is the group of Other Solve routines - * - **** - * - * @defgroup solveGE Linear Solve - * @ingroup GE - * This is the group of Linear Solve Driver routines - * @defgroup solveGB Linear Solve - * @ingroup GB - * This is the group of Linear Solve Driver routines - * @defgroup solveSY Linear Solve - * @ingroup SY - * This is the group of Linear Solve Driver routines - * @defgroup solveHE Linear Solve - * @ingroup HE - * This is the group of Linear Solve Driver routines - * @defgroup solvePO Linear Solve - * @ingroup PO - * This is the group of Linear Solve Driver routines - * @defgroup solveGT Linear Solve - * @ingroup GT - * This is the group of Linear Solve Driver routines - * @defgroup solvePT Linear Solve - * @ingroup PT - * This is the group of Linear Solve Driver routines - * @defgroup eigenGE Eigenvalue - * @ingroup GE - * This is the group of Eigenvalue Driver routines - * @defgroup eigenSY Eigenvalue - * @ingroup SY - * This is the group of Eigenvalue Driver routines - * @defgroup eigenHE Eigenvalue - * @ingroup HE - * This is the group of Eigenvalue Driver routines - * @defgroup singGE Singular Value - * @ingroup GE - * This is the group of Singular Value Driver routines - * @defgroup computationalGE Computational routines - * @ingroup GE - * This is the group of Computational routines - * @defgroup variantsGEcomputational Variants Computational routines - * @ingroup GE - * This is the group of Variants Computational routines - * @defgroup computationalGB Computational routines - * @ingroup GB - * This is the group of Computational routines - * @defgroup computationalSY Computational routines - * @ingroup SY - * This is the group of Computational routines - * @defgroup computationalHE Computational routines - * @ingroup HE - * This is the group of Computational routines - * @defgroup computationalPO Computational routines - * @ingroup PO - * This is the group of Computational routines - * @defgroup variantsPOcomputational Variants Computational routines - * @ingroup PO - * This is the group of Variants Computational routines - * @defgroup computationalGT Computational routines - * @ingroup GT - * This is the group of Computational routines - * @defgroup computationalPT Computational routines - * @ingroup PT - * This is the group of Computational routines - * @defgroup 
variantsOTHERcomputational Variants Computational routines - * @ingroup OTHERcomputational - * This is the group of Variants Computational routines - * @defgroup auxiliaryGE Auxiliary routines - * @ingroup GE - * This is the group of Auxiliary routines - * @defgroup auxiliaryGB Auxiliary routines - * @ingroup GB - * This is the group of Auxiliary routines - * @defgroup auxiliarySY Auxiliary routines - * @ingroup SY - * This is the group of Auxiliary routines - * @defgroup auxiliaryHE Auxiliary routines - * @ingroup HE - * This is the group of Auxiliary routines - * @defgroup auxiliaryPO Auxiliary routines - * @ingroup PO - * This is the group of Auxiliary routines - * @defgroup auxiliaryGT Auxiliary routines - * @ingroup GT - * This is the group of Auxiliary routines - * @defgroup auxiliaryPT Auxiliary routines - * @ingroup PT - * This is the group of Auxiliary routines - * - **** - * - * @defgroup doubleGEsolve double - * @ingroup solveGE - * This is the group of double solve driver functions for GE matrices - * @defgroup doublePOsolve double - * @ingroup solvePO - * This is the group of double solve driver functions for PO matrices - * @defgroup doubleSYsolve double - * @ingroup solveSY - * This is the group of double solve driver functions for SY matrices - * @defgroup doubleGBsolve double - * @ingroup solveGB - * This is the group of double solve driver functions for GB matrices - * @defgroup doubleGTsolve double - * @ingroup solveGT - * This is the group of double solve driver functions for GT matrices - * @defgroup doublePTsolve double - * @ingroup solvePT - * This is the group of double solve driver functions for PT matrices - * @defgroup doubleGEeigen double - * @ingroup eigenGE - * This is the group of double eigenvalue driver functions for GE matrices - * @defgroup doubleSYeigen double - * @ingroup eigenSY - * This is the group of double eigenvalue driver functions for SY matrices - * @defgroup doubleGEsing double - * @ingroup singGE - * This is the group of double singular value driver functions for GE matrices - * @defgroup doubleGEcomputational double - * @ingroup computationalGE - * This is the group of double computational functions for GE matrices - * @defgroup doublePOcomputational double - * @ingroup computationalPO - * This is the group of double computational functions for PO matrices - * @defgroup doubleSYcomputational double - * @ingroup computationalSY - * This is the group of double computational functions for SY matrices - * @defgroup doubleGBcomputational double - * @ingroup computationalGB - * This is the group of double computational functions for GB matrices - * @defgroup doubleGTcomputational double - * @ingroup computationalGT - * This is the group of double computational functions for GT matrices - * @defgroup doublePTcomputational double - * @ingroup computationalPT - * This is the group of double computational functions for PT matrices - * @defgroup doubleGEauxiliary double - * @ingroup auxiliaryGE - * This is the group of double auxiliary functions for GE matrices - * @defgroup doublePOauxiliary double - * @ingroup auxiliaryPO - * This is the group of double auxiliary functions for PO matrices - * @defgroup doubleSYauxiliary double - * @ingroup auxiliarySY - * This is the group of double auxiliary functions for SY matrices - * @defgroup doubleGBauxiliary double - * @ingroup auxiliaryGB - * This is the group of double auxiliary functions for GB matrices - * @defgroup doublePTauxiliary double - * @ingroup auxiliaryPT - * This is the group of double 
auxiliary functions for PT matrices - * @defgroup doubleGTauxiliary double - * @ingroup auxiliaryGT - * This is the group of double auxiliary functions for GT matrices - * @defgroup doubleOTHERauxiliary double - * @ingroup OTHERauxiliary - * This is the group of double other auxiliary routines - * @defgroup doubleOTHERcomputational double - * @ingroup OTHERcomputational - * This is the group of double other Computational routines - * @defgroup doubleOTHERsolve double Other Solve Routines - * @ingroup OTHERsolve - * This is the group of double Other Solve routines - * @defgroup doubleOTHEReigen double - * @ingroup OTHEReigen - * This is the group of double Other Eigenvalue routines - * - **** - * - * @defgroup realGEsolve real - * @ingroup solveGE - * This is the group of real solve driver functions for GE matrices - * @defgroup realPOsolve real - * @ingroup solvePO - * This is the group of real solve driver functions for PO matrices - * @defgroup realSYsolve real - * @ingroup solveSY - * This is the group of real solve driver functions for SY matrices - * @defgroup realGBsolve real - * @ingroup solveGB - * This is the group of real solve driver functions for GB matrices - * @defgroup realGTsolve real - * @ingroup solveGT - * This is the group of real solve driver functions for GT matrices - * @defgroup realPTsolve real - * @ingroup solvePT - * This is the group of real solve driver functions for PT matrices - * @defgroup realGEeigen real - * @ingroup eigenGE - * This is the group of real eigenvalue driver functions for GE matrices - * @defgroup realSYeigen real - * @ingroup eigenSY - * This is the group of real eigenvalue driver functions for SY matrices - * @defgroup realGEsing real - * @ingroup singGE - * This is the group of real singular value driver functions for GE matrices - * @defgroup realGEcomputational real - * @ingroup computationalGE - * This is the group of real computational functions for GE matrices - * @defgroup realPOcomputational real - * @ingroup computationalPO - * This is the group of real computational functions for PO matrices - * @defgroup realSYcomputational real - * @ingroup computationalSY - * This is the group of real computational functions for SY matrices - * @defgroup realGBcomputational real - * @ingroup computationalGB - * This is the group of real computational functions for GB matrices - * @defgroup realPTcomputational real - * @ingroup computationalPT - * This is the group of real computational functions for PT matrices - * @defgroup realGTcomputational real - * @ingroup computationalGT - * This is the group of real computational functions for GT matrices - * @defgroup realGEauxiliary real - * @ingroup auxiliaryGE - * This is the group of real auxiliary functions for GE matrices - * @defgroup realPOauxiliary real - * @ingroup auxiliaryPO - * This is the group of real auxiliary functions for PO matrices - * @defgroup realSYauxiliary real - * @ingroup auxiliarySY - * This is the group of real auxiliary functions for SY matrices - * @defgroup realGBauxiliary real - * @ingroup auxiliaryGB - * This is the group of real auxiliary functions for GB matrices - * @defgroup realGTauxiliary real - * @ingroup auxiliaryGT - * This is the group of real auxiliary functions for GT matrices - * @defgroup realPTauxiliary real - * @ingroup auxiliaryPT - * This is the group of real auxiliary functions for PT matrices - * @defgroup realOTHERauxiliary real - * @ingroup OTHERauxiliary - * This is the group of real other auxiliary routines - * @defgroup realOTHERcomputational 
real - * @ingroup OTHERcomputational - * This is the group of real other Computational routines - * @defgroup realOTHERsolve real Other Solve Routines - * @ingroup OTHERsolve - * This is the group of real Other Solve routines - * @defgroup realOTHEReigen real - * @ingroup OTHEReigen - * This is the group of real Other Eigenvalue routines - * - **** - * - * @defgroup complexGEsolve complex - * @ingroup solveGE - * This is the group of complex solve driver functions for GE matrices - * @defgroup complexPOsolve complex - * @ingroup solvePO - * This is the group of complex solve driver functions for PO matrices - * @defgroup complexSYsolve complex - * @ingroup solveSY - * This is the group of complex solve driver functions for SY matrices - * @defgroup complexHEsolve complex - * @ingroup solveHE - * This is the group of complex solve driver functions for HE matrices - * @defgroup complexGBsolve complex - * @ingroup solveGB - * This is the group of complex solve driver functions for GB matrices - * @defgroup complexGTsolve complex - * @ingroup solveGT - * This is the group of complex solve driver functions for GT matrices - * @defgroup complexPTsolve complex - * @ingroup solvePT - * This is the group of complex solve driver functions for PT matrices - * @defgroup complexGEeigen complex - * @ingroup eigenGE - * This is the group of complex eigenvalue driver functions for GE matrices - * @defgroup complexSYeigen complex - * @ingroup eigenSY - * This is the group of complex eigenvalue driver functions for SY matrices - * @defgroup complexHEeigen complex - * @ingroup eigenHE - * This is the group of complex eigenvalue driver functions for HE matrices - * @defgroup complexGEsing complex - * @ingroup singGE - * This is the group of complex singular value driver functions for GE matrices - * @defgroup complexGEcomputational complex - * @ingroup computationalGE - * This is the group of complex computational functions for GE matrices - * @defgroup complexPOcomputational complex - * @ingroup computationalPO - * This is the group of complex computational functions for PO matrices - * @defgroup complexSYcomputational complex - * @ingroup computationalSY - * This is the group of complex computational functions for SY matrices - * @defgroup complexHEcomputational complex - * @ingroup computationalHE - * This is the group of complex computational functions for HE matrices - * @defgroup complexGBcomputational complex - * @ingroup computationalGB - * This is the group of complex computational functions for GB matrices - * @defgroup complexGTcomputational complex - * @ingroup computationalGT - * This is the group of complex computational functions for GT matrices - * @defgroup complexPTcomputational complex - * @ingroup computationalPT - * This is the group of complex computational functions for PT matrices - * @defgroup complexGEauxiliary complex - * @ingroup auxiliaryGE - * This is the group of complex auxiliary functions for GE matrices - * @defgroup complexPOauxiliary complex - * @ingroup auxiliaryPO - * This is the group of complex auxiliary functions for PO matrices - * @defgroup complexSYauxiliary complex - * @ingroup auxiliarySY - * This is the group of complex auxiliary functions for SY matrices - * @defgroup complexHEauxiliary complex - * @ingroup auxiliaryHE - * This is the group of complex auxiliary functions for HE matrices - * @defgroup complexGBauxiliary complex - * @ingroup auxiliaryGB - * This is the group of complex auxiliary functions for GB matrices - * @defgroup complexOTHERauxiliary complex 
- * @ingroup OTHERauxiliary - * This is the group of complex other auxiliary routines - * @defgroup complexOTHERcomputational complex - * @ingroup OTHERcomputational - * This is the group of complex other Computational routines - * @defgroup complexOTHERsolve complex Other Solve Routines - * @ingroup OTHERsolve - * This is the group of complex Other Solve routines - * @defgroup complexOTHEReigen complex Other Eigenvalue routines - * @ingroup OTHEReigen - * This is the group of complex Other Eigenvalue routines - * - **** - * - * @defgroup complex16GEsolve complex16 - * @ingroup solveGE - * This is the group of complex16 solve driver functions for GE matrices - * @defgroup complex16POsolve complex16 - * @ingroup solvePO - * This is the group of complex16 solve driver functions for PO matrices - * @defgroup complex16SYsolve complex16 - * @ingroup solveSY - * This is the group of complex16 solve driver functions for SY matrices - * @defgroup complex16HEsolve complex16 - * @ingroup solveHE - * This is the group of complex16 solve driver functions for HE matrices - * @defgroup complex16GBsolve complex16 - * @ingroup solveGB - * This is the group of complex16 solve driver functions for GB matrices - * @defgroup complex16GTsolve complex16 - * @ingroup solveGT - * This is the group of complex16 solve driver functions for GT matrices - * @defgroup complex16PTsolve complex16 - * @ingroup solvePT - * This is the group of complex16 solve driver functions for PT matrices - * @defgroup complex16GEeigen complex16 - * @ingroup eigenGE - * This is the group of complex16 eigenvalue driver functions for GE matrices - * @defgroup complex16SYeigen complex16 - * @ingroup eigenSY - * This is the group of complex16 eigenvalue driver functions for SY matrices - * @defgroup complex16HEeigen complex16 - * @ingroup eigenHE - * This is the group of complex16 eigenvalue driver functions for HE matrices - * @defgroup complex16GEsing complex16 - * @ingroup singGE - * This is the group of complex16 singular value driver functions for GE matrices - * @defgroup complex16GEcomputational complex16 - * @ingroup computationalGE - * This is the group of complex16 computational functions for GE matrices - * @defgroup complex16POcomputational complex16 - * @ingroup computationalPO - * This is the group of complex16 computational functions for PO matrices - * @defgroup complex16SYcomputational complex16 - * @ingroup computationalSY - * This is the group of complex16 computational functions for SY matrices - * @defgroup complex16HEcomputational complex16 - * @ingroup computationalHE - * This is the group of complex16 computational functions for HE matrices - * @defgroup complex16GBcomputational complex16 - * @ingroup computationalGB - * This is the group of complex16 computational functions for GB matrices - * @defgroup complex16GTcomputational complex16 - * @ingroup computationalGT - * This is the group of complex16 computational functions for GT matrices - * @defgroup complex16PTcomputational complex16 - * @ingroup computationalPT - * This is the group of complex16 computational functions for PT matrices - * @defgroup complex16GEauxiliary complex16 - * @ingroup auxiliaryGE - * This is the group of complex16 auxiliary functions for GE matrices - * @defgroup complex16POauxiliary complex16 - * @ingroup auxiliaryPO - * This is the group of complex16 auxiliary functions for PO matrices - * @defgroup complex16SYauxiliary complex16 - * @ingroup auxiliarySY - * This is the group of complex16 auxiliary functions for SY matrices - * 
@defgroup complex16HEauxiliary complex16 - * @ingroup auxiliaryHE - * This is the group of complex16 auxiliary functions for HE matrices - * @defgroup complex16GBauxiliary complex16 - * @ingroup auxiliaryGB - * This is the group of complex16 auxiliary functions for GB matrices - * @defgroup complex16OTHERcomputational complex16 - * @ingroup OTHERcomputational - * This is the group of complex16 other Computational routines - * @defgroup complex16OTHERauxiliary complex16 - * @ingroup OTHERauxiliary - * This is the group of complex16 other auxiliary routines - * @defgroup auxOTHERcomputational auxiliary Computational routines - * @ingroup OTHERcomputational - * This is the group of auxiliary Computational routines - * @defgroup complex16OTHERsolve complex16 Other Solve Routines - * @ingroup OTHERsolve - * This is the group of complex16 Other Solve routines - * @defgroup complex16OTHEReigen complex16 Other Eigenvalue routines - * @ingroup OTHEReigen - * This is the group of complex16 Other Eigenvalue routines - * - **** - * - * @defgroup testing LAPACK Testing - * This is the group of LAPACK TESTING routines. - * - * @defgroup matgen Matrix Generation - * @ingroup testing - * This is the group of LAPACK TESTING MATGEN routines. - * - * @defgroup lin Linear Solve - * @ingroup testing - * This is the group of LAPACK TESTING LIN routines. - * - * @defgroup eig Eigenvalue and Singular value - * @ingroup testing - * This is the group of LAPACK TESTING EIG routines. - * - * @defgroup real_matgen real - * @ingroup matgen - * This is the group of real LAPACK TESTING MATGEN routines. - * - * @defgroup double_matgen double - * @ingroup matgen - * This is the group of double LAPACK TESTING MATGEN routines. - * - * @defgroup complex_matgen complex - * @ingroup matgen - * This is the group of complex LAPACK TESTING MATGEN routines. - * - * @defgroup complex16_matgen complex16 - * @ingroup matgen - * This is the group of complex16 LAPACK TESTING MATGEN routines. - * - * @defgroup aux_matgen aux - * @ingroup matgen - * This is the group of auxiliary LAPACK TESTING MATGEN routines. - * - * @defgroup single_lin real - * @ingroup lin - * This is the group of real LAPACK TESTING LIN routines. - * - * @defgroup double_lin double - * @ingroup lin - * This is the group of double LAPACK TESTING LIN routines. - * - * @defgroup complex_lin complex - * @ingroup lin - * This is the group of complex LAPACK TESTING LIN routines. - * - * @defgroup complex16_lin complex16 - * @ingroup lin - * This is the group of complex16 LAPACK TESTING LIN routines. - * - * @defgroup aux_lin aux - * @ingroup lin - * This is the group of auxiliary LAPACK TESTING LIN routines. - * - * @defgroup single_eig real - * @ingroup eig - * This is the group of real LAPACK TESTING EIG routines. - * - * @defgroup double_eig double - * @ingroup eig - * This is the group of double LAPACK TESTING EIG routines. - * - * @defgroup complex_eig complex - * @ingroup eig - * This is the group of complex LAPACK TESTING EIG routines. - * - * @defgroup complex16_eig complex16 - * @ingroup eig - * This is the group of complex16 LAPACK TESTING EIG routines. - * - * @defgroup aux_eig aux - * @ingroup eig - * This is the group of auxiliary LAPACK TESTING EIG routines. - * - **** - * @defgroup blas Reference BLAS - * This is the group of BLAS routines. - * - * @defgroup level1 Level1 - * @ingroup blas - * This is the group of LEVEL 1 BLAS routines. - * @defgroup level2 Level2 - * @ingroup blas - * This is the group of LEVEL 2 BLAS routines. 
- * @defgroup level3 Level3 - * @ingroup blas - * This is the group of LEVEL 3 BLAS routines. - * @defgroup aux_blas Auxiliary BLAS - * @ingroup blas - * This is the group of Auxiliary 3 BLAS routines. -* @defgroup blastesting Testing - * @ingroup blas - * This is the group of BLAS TESTING routines. - * - * @defgroup single_blas_level1 real - * @ingroup level1 - * This is the group of real LEVEL 1 BLAS routines. - * @defgroup double_blas_level1 double - * @ingroup level1 - * This is the group of double LEVEL 1 BLAS routines. - * @defgroup complex_blas_level1 complex - * @ingroup level1 - * This is the group of complex LEVEL 1 BLAS routines. - * @defgroup complex16_blas_level1 complex16 - * @ingroup level1 - * This is the group of complex16 LEVEL 1 BLAS routines. - * - * @defgroup single_blas_level2 real - * @ingroup level2 - * This is the group of real LEVEL 2 BLAS routines. - * @defgroup double_blas_level2 double - * @ingroup level2 - * This is the group of double LEVEL 2 BLAS routines. - * @defgroup complex_blas_level2 complex - * @ingroup level2 - * This is the group of complex LEVEL 2 BLAS routines. - * @defgroup complex16_blas_level2 complex16 - * @ingroup level2 - * This is the group of complex16 LEVEL 2 BLAS routines. - * - * @defgroup single_blas_level3 real - * @ingroup level3 - * This is the group of real LEVEL 3 BLAS routines. - * @defgroup double_blas_level3 double - * @ingroup level3 - * This is the group of double LEVEL 3 BLAS routines. - * @defgroup complex_blas_level3 complex - * @ingroup level3 - * This is the group of complex LEVEL 3 BLAS routines. - * @defgroup complex16_blas_level3 complex16 - * @ingroup level3 - * This is the group of complex16 LEVEL 3 BLAS routines. - * - * @defgroup single_blas_testing real - * @ingroup blastesting - * This is the group of real BLAS TESTING routines. - * @defgroup double_blas_testing double - * @ingroup blastesting - * This is the group of double BLAS TESTING routines. - * @defgroup complex_blas_testing complex - * @ingroup blastesting - * This is the group of complex BLAS TESTING routines. - * @defgroup complex16_blas_testing complex16 - * @ingroup blastesting - * This is the group of complex16 BLAS TESTING routines. - * + +@defgroup lapack_top LAPACK +@{ + @defgroup solve_top Linear solve, AX = B + @{ + @defgroup gesv_driver_grp LU: General matrix, driver + @{ + @defgroup gesv_driver --- full --- + @defgroup gesv gesv: factor and solve + @defgroup gesvx gesvx: factor and solve, expert + @defgroup gesvxx gesvxx: factor and solve, extra precise + @defgroup gesv_mixed gesv: factor and solve, mixed precision + + @defgroup gbsv_driver --- banded --- + @defgroup gbsv gbsv: factor and solve + @defgroup gbsvx gbsvx: factor and solve, expert + @defgroup gbsvxx gbsvxx: factor and solve, extra precise + + @defgroup gtsv_driver --- tridiagonal --- + @defgroup gtsv gtsv: factor and solve + @defgroup gtsvx gtsvx: factor and solve, expert + @} + + @defgroup gesv_comp_grp LU: computational routines (factor, cond, etc.) + @{ + @defgroup gesv_comp --- full --- + @defgroup gecon gecon: condition number estimate + @defgroup getrf getrf: triangular factor + @defgroup getrf2 getrf2: triangular factor panel, recursive? 
+ @defgroup getf2 getf2: triangular factor panel, level 2 + @defgroup getrs getrs: triangular solve using factor + @defgroup getri getri: triangular inverse + @defgroup gerfs gerfs: iterative refinement + @defgroup gerfsx gerfsx: iterative refinement, expert + @defgroup geequ geequ: equilibration + @defgroup geequb geequb: equilibration, power of 2 + @defgroup laqge laqge: row/col scale matrix + @defgroup laswp laswp: swap permutation + + @defgroup getc2 getc2: triangular factor, with complete pivoting + @defgroup gesc2 gesc2: triangular solve using factor, with complete pivoting + @defgroup latdf latdf: Dif-estimate with complete pivoting LU, step in tgsen + + @defgroup la_gercond la_gercond: Skeel condition number estimate + @defgroup la_gerpvgrw la_gerpvgrw: reciprocal pivot growth + @defgroup la_gerfsx_extended la_gerfsx_extended: step in gerfsx + + @defgroup gbsv_comp --- banded --- + @defgroup gbcon gbcon: condition number estimate + @defgroup gbtrf gbtrf: triangular factor + @defgroup gbtf2 gbtf2: triangular factor, level 2 + @defgroup gbtrs gbtrs: triangular solve using factor + @defgroup gbrfs gbrfs: iterative refinement + @defgroup gbrfsx gbrfsx: iterative refinement, expert + @defgroup gbequ gbequ: equilibration + @defgroup gbequb gbequb: equilibration, power of 2 + @defgroup laqgb laqgb: row/col scale matrix + @defgroup la_gbrcond la_gbrcond: Skeel condition number estimate + @defgroup la_gbrpvgrw la_gbrpvgrw: reciprocal pivot growth + @defgroup la_gbrfsx_extended la_gbrfsx_extended: step in gbrfsx + + @defgroup gtsv_comp --- tridiagonal --- + @defgroup gtcon gtcon: condition number estimate + @defgroup gttrf gttrf: triangular factor + @defgroup gttrs gttrs: triangular solve using factor + @defgroup gtts2 gtts2: triangular solve using factor + @defgroup gtrfs gtrfs: iterative refinement + @} + + @defgroup posv_driver_grp Cholesky: Hermitian/symmetric positive definite matrix, driver + @{ + @defgroup posv_driver --- full --- + @defgroup posv posv: factor and solve + @defgroup posvx posvx: factor and solve, expert + @defgroup posvxx posvxx: factor and solve, extra precise + @defgroup posv_mixed posv: factor and solve, mixed precision + + @defgroup ppsv_driver --- packed --- + @defgroup ppsv ppsv: factor and solve + @defgroup ppsvx ppsvx: factor and solve, expert + + @defgroup pfsv_driver --- rectangular full packed (RFP) --- + @defgroup pfsv pfsv: factor and solve [not available] + + @defgroup pbsv_driver --- banded --- + @defgroup pbsv pbsv: factor and solve + @defgroup pbsvx pbsvx: factor and solve, expert + + @defgroup ptsv_driver --- tridiagonal --- + @defgroup ptsv ptsv: factor and solve + @defgroup ptsvx ptsvx: factor and solve, expert + @} + + @defgroup posv_comp_grp Cholesky: computational routines (factor, cond, etc.) + @{ + @defgroup posv_comp --- full --- + @defgroup pocon pocon: condition number estimate + @defgroup potrf potrf: triangular factor + @defgroup potrf2 potrf2: triangular factor panel, recursive? 
+ @defgroup potf2 potf2: triangular factor panel, level 2 + @defgroup pstrf pstrf: triangular factor, with pivoting + @defgroup pstf2 pstf2: triangular factor, with pivoting panel, level 2 + @defgroup potrs potrs: triangular solve using factor + @defgroup potri potri: triangular inverse + @defgroup porfs porfs: iterative refinement + @defgroup porfsx porfsx: iterative refinement, expert + @defgroup poequ poequ: equilibration + @defgroup poequb poequb: equilibration, power of 2 + @defgroup laqhe laqhe: row/col scale matrix + @defgroup la_porcond la_porcond: Skeel condition number estimate + @defgroup la_porpvgrw la_porpvgrw: reciprocal pivot growth + @defgroup la_porfsx_extended la_porfsx_extended: step in porfsx + + @defgroup ppsv_comp --- packed --- + @defgroup ppcon ppcon: condition number estimate + @defgroup pptrf pptrf: triangular factor + @defgroup pptrs pptrs: triangular solve using factor + @defgroup pptri pptri: triangular inverse + @defgroup pprfs pprfs: iterative refinement + @defgroup ppequ ppequ: equilibration + @defgroup laqhp laqhp: row/col scale matrix + + @defgroup pfsv_comp --- rectangular full packed (RFP) --- + @defgroup pftrf pftrf: triangular factor + @defgroup pftrs pftrs: triangular solve using factor + @defgroup pftri pftri: triangular inverse + + @defgroup pbsv_comp --- banded --- + @defgroup pbcon pbcon: condition number estimate + @defgroup pbtrf pbtrf: triangular factor + @defgroup pbtf2 pbtf2: triangular factor panel, level 2 + @defgroup pbtrs pbtrs: triangular solve using factor + @defgroup pbrfs pbrfs: iterative refinement + @defgroup pbequ pbequ: equilibration + @defgroup laqhb laqhb: row/col scale matrix + + @defgroup ptsv_comp --- tridiagonal --- + @defgroup ptcon ptcon: condition number estimate + @defgroup pttrf pttrf: triangular factor + @defgroup pttrs pttrs: triangular solve using factor + @defgroup ptts2 ptts2: triangular solve using factor, unblocked + @defgroup ptrfs ptrfs: iterative refinement + @} + + @defgroup hesv_driver_grp LDL: Hermitian/symmetric indefinite matrix, driver + @{ + @defgroup hesv_driver --- full, rook pivoting --- + @defgroup hesv {he,sy}sv: rook (v1) + @defgroup hesv_rook {he,sy}sv_rook: rook (v2) + @defgroup hesv_rk {he,sy}sv_rk: rook (v3) + @defgroup hesvx {he,sy}svx: rook (v1, expert) + @defgroup hesvxx {he,sy}svxx: rook (v1, expert) + + @defgroup hpsv_driver --- packed, rook pivoting --- + @defgroup hpsv {hp,sp}sv: factor and solve + @defgroup hpsvx {hp,sp}svx: factor and solve, expert + + @defgroup hesv_aa_driver --- full, Aasen --- + @defgroup hesv_aa {he,sy}sv_aa: Aasen + @defgroup hesv_aa_2stage {he,sy}sv_aa_2stage: Aasen, blocked 2-stage + @} + + @defgroup hesv_comp_grp LDL: computational routines (factor, cond, etc.) 
+ @{ + @defgroup hesv_comp_v1 --- full, rook v1 --- + @defgroup hecon {he,sy}con: condition number estimate + @defgroup hetrf {he,sy}trf: triangular factor + @defgroup lahef la{he,sy}f: step in hetrf + @defgroup hetf2 {he,sy}tf2: triangular factor, level 2 + @defgroup hetrs {he,sy}trs: triangular solve using factor + @defgroup hetri {he,sy}tri: triangular inverse + @defgroup herfs {he,sy}rfs: iterative refinement + @defgroup herfsx {he,sy}rfsx: iterative refinement, expert + @defgroup heequb {he,sy}equb: equilibration, power of 2 + @defgroup syconv syconv: convert to/from L and D from hetrf + + @defgroup hecon_3 {he,sy}con_3: condition number estimate + @defgroup hetri2 {he,sy}tri2: inverse + @defgroup hetri2x {he,sy}tri2x: inverse + @defgroup hetri_3 {he,sy}tri_3: inverse + @defgroup hetri_3x {he,sy}tri_3x: inverse + @defgroup hetrs2 {he,sy}trs2: solve using factor + @defgroup hetrs_3 {he,sy}trs_3: solve using factor + + @defgroup heswapr {he,sy}swapr: apply 2-sided permutation + @defgroup la_hercond la_hercond: Skeel condition number estimate + @defgroup la_herfsx_extended la_herfsx_extended: step in herfsx + @defgroup la_herpvgrw la_herpvgrw: reciprocal pivot growth + + @defgroup hpsv_comp --- packed, rook v1 --- + @defgroup hpcon {hp,sp}con: condition number estimate + @defgroup hptrf {hp,sp}trf: triangular factor + @defgroup hptrs {hp,sp}trs: triangular solve using factor + @defgroup hptri {hp,sp}tri: triangular inverse + @defgroup hprfs {hp,sp}rfs: iterative refinement + + @defgroup hesv_comp_v2 --- full, rook v2 --- + @defgroup hecon_rook {he,sy}con_rook: condition number estimate + @defgroup hetrf_rook {he,sy}trf_rook: triangular factor + @defgroup lahef_rook la{he,sy}f_rook: triangular factor step + @defgroup hetf2_rook {he,sy}tf2_rook: triangular factor, level 2 + @defgroup hetrs_rook {he,sy}trs_rook: triangular solve using factor + @defgroup hetri_rook {he,sy}tri_rook: triangular inverse + + @defgroup hesv_comp_v3 --- full, rook v3 --- + @defgroup hetrf_rk {he,sy}trf_rk: triangular factor + @defgroup lahef_rk la{he,sy}f_rk: triangular factor step + @defgroup hetf2_rk {he,sy}tf2_rk: triangular factor, level 2 + @defgroup syconvf syconvf: convert to/from hetrf to hetrf_rk format + @defgroup syconvf_rook syconvf_rook: convert to/from hetrf_rook to hetrf_rk format + + @defgroup hesv_comp_aasen --- full, Aasen --- + @defgroup hetrf_aa {he,sy}trf_aa: triangular factor + @defgroup lahef_aa la{he,sy}f_aa: triangular factor partial factor + @defgroup hetrs_aa {he,sy}trs_aa: triangular solve using factor + + @defgroup hesv_comp_aasen2 --- full, Aasen, blocked 2-stage --- + @defgroup hetrf_aa_2stage {he,sy}trf_aa_2stage: triangular factor + @defgroup hetrs_aa_2stage {he,sy}trs_aa_2stage: triangular solve using factor + @} + + @defgroup trsv_comp_grp Triangular computational routines (solve, cond, etc.) 
+ @{ + @defgroup trsv_comp --- full --- + @defgroup trcon trcon: condition number estimate + @defgroup trtrs trtrs: triangular solve + @defgroup latrs latrs: triangular solve with robust scaling + @defgroup latrs3 latrs3: triangular solve with robust scaling, level 3 + @defgroup trtri trtri: triangular inverse + @defgroup trti2 trti2: triangular inverse, level 2 + @defgroup trrfs trrfs: triangular iterative refinement + @defgroup lauum lauum: triangular multiply: U^H U + @defgroup lauu2 lauu2: triangular multiply: U^H U, level 2 + + @defgroup tpsv_comp --- packed --- + @defgroup tpcon tpcon: condition number estimate + @defgroup tptrs tptrs: triangular solve + @defgroup latps latps: triangular solve with robust scaling + @defgroup tptri tptri: triangular inverse + @defgroup tprfs tprfs: triangular iterative refinement + + @defgroup tfsv_comp --- rectangular full packed (RFP) --- + @defgroup tftri tftri: triangular inverse, RFP + + @defgroup tbsv_comp --- banded --- + @defgroup tbcon tbcon: condition number estimate + @defgroup tbtrs tbtrs: triangular solve + @defgroup latbs latbs: triangular solve with scaling + @defgroup tbrfs tbrfs: triangular iterative refinement + @} + + @defgroup solve_aux_grp Auxiliary routines + @{ + @defgroup lacn2 lacn2: 1-norm estimate, e.g., || A^{-1} ||_1 in gecon + @defgroup lacon lacon: 1-norm estimate, e.g., || A^{-1} ||_1 in gecon, old + @defgroup la_lin_berr la_lin_berr: backward error + @} + @} + + @defgroup gels_top Least squares + @{ + @defgroup gels_driver_grp Standard least squares, min || Ax - b ||_2 + @{ + @defgroup gels gels: least squares using QR/LQ + @defgroup gelst gelst: least squares using QR/LQ with T matrix + @defgroup gelss gelss: least squares using SVD, QR iteration + @defgroup gelsd gelsd: least squares using SVD, divide and conquer + @defgroup gelsy gelsy: least squares using complete orthogonal factor + @defgroup getsls getsls: least squares using tall-skinny QR/LQ + @} + + @defgroup ggls_driver_grp Constrained least squares + @{ + @defgroup gglse gglse: equality-constrained least squares + @defgroup ggglm ggglm: Gauss-Markov linear model + @} + + @defgroup gels_aux_grp Auxiliary routines + @{ + @defgroup laic1 laic1: condition estimate, step in gelsy + @defgroup lals0 lals0: back multiplying factors, step in gelsd + @defgroup lalsa lalsa: SVD of coefficient matrix, step in gelsd + @defgroup lalsd lalsd: uses SVD for least squares, step in gelsd + @} + @} + + @defgroup unitary_top Orthogonal/unitary factors (QR, CS, etc.) 
+ @{ + @defgroup geqr_comp_grp QR + @{ + @defgroup geqr_comp1 --- flexible --- + @defgroup geqr geqr: QR factor, flexible + @defgroup gemqr gemqr: multiply by Q from geqr + + @defgroup geqr_comp2 --- classic --- + @defgroup geqrf geqrf: QR factor + @defgroup geqr2 geqr2: QR factor, level 2 + @defgroup ungqr {un,or}gqr: generate explicit Q from geqrf + @defgroup ung2r {un,or}g2r: generate explicit Q from geqrf, level 2 + @defgroup unmqr {un,or}mqr: multiply by Q from geqrf + @defgroup unm2r {un,or}m2r: multiply by Q from geqrf, level 2 + + @defgroup geqr_comp3 --- with T --- + @defgroup geqrt geqrt: QR factor, with T + @defgroup geqrt2 geqrt2: QR factor, with T, level 2 + @defgroup geqrt3 geqrt3: QR factor, with T, recursive panel + @defgroup gemqrt gemqrt: multiply by Q from geqrt + + @defgroup geqr_comp4 --- positive --- + @defgroup geqrfp geqrfp: QR factor, diag( R ) ≥ 0 + @defgroup geqr2p geqr2p: QR factor, diag( R ) ≥ 0, level 2 + @} + + @defgroup geqpf_comp_grp QR with pivoting + @{ + @defgroup geqp3 geqp3: QR factor with pivoting, level 3 + @defgroup laqp2 laqp2: step of geqp3 + @defgroup laqps laqps: step of geqp3 + @} + + @defgroup getsqr_comp_grp QR, tall-skinny + @{ + @defgroup latsqr latsqr: tall-skinny QR factor + @defgroup ungtsqr {un,or}gtsqr: generate Q from latsqr + @defgroup ungtsqr_row {un,or}gtsqr_row: generate Q from latsqr + @defgroup larfb_gett larfb_gett: step in ungtsqr_row + @defgroup lamtsqr lamtsqr: multiply by Q from latsqr + @defgroup getsqrhrt getsqrhrt: tall-skinny QR factor, with Householder reconstruction + @defgroup unhr_col {un,or}hr_col: Householder reconstruction + @defgroup launhr_col_getrfnp la{un,or}hr_col_getrfnp: LU factor without pivoting + @defgroup launhr_col_getrfnp2 la{un,or}hr_col_getrfnp2: LU factor without pivoting, level 2 + @} + + @defgroup tpqr_comp_grp QR, triangular-pentagonal + @{ + @defgroup tpqrt tpqrt: QR factor + @defgroup tpqrt2 tpqrt2: QR factor, level 2 + @defgroup tpmqrt tpmqrt: applies Q + @defgroup tprfb tprfb: applies Q (like larfb) + @} + + @defgroup ggqr_comp_grp Generalized QR + @{ + @defgroup ggqrf ggqrf: Generalized QR factor + @} + + @defgroup gelq_comp_grp LQ + @{ + @defgroup gelq_comp1 --- flexible --- + @defgroup gelq gelq: LQ factor, flexible + @defgroup gemlq gemlq: multiply by Q from gelq + + @defgroup gelq_comp2 --- classic --- + @defgroup gelqf gelqf: LQ factor + @defgroup gelq2 gelq2: LQ factor, level 2 + @defgroup unglq {un,or}glq: generate explicit Q from gelqf + @defgroup ungl2 {un,or}gl2: generate explicit Q, level 2, step in unglq + @defgroup unmlq {un,or}mlq: multiply by Q from gelqf + @defgroup unml2 {un,or}ml2: multiply by Q, level 2, step in unmlq + + @defgroup gelq_comp3 --- with T --- + @defgroup gelqt gelqt: LQ factor, with T + @defgroup gelqt3 gelqt3: LQ factor, with T, recursive + @defgroup gemlqt gemlqt: multiply by Q from gelqt + @} + + @defgroup geswlq_comp_grp LQ, short-wide + @{ + @defgroup laswlq laswlq: short-wide LQ factor + @defgroup lamswlq lamswlq: multiply by Q from laswlq + @} + + @defgroup tplq_comp_grp LQ, triangular-pentagonal + @{ + @defgroup tplqt tplqt: QR factor + @defgroup tplqt2 tplqt2: QR factor, level 2 + @defgroup tpmlqt tpmlqt: applies Q + @} + + @defgroup geql_comp_grp QL + @{ + @defgroup geqlf geqlf: QL factor + @defgroup geql2 geql2: QL factor, level 2 + @defgroup ungql {un,or}gql: generate explicit Q from geqlf + @defgroup unmql {un,or}mql: multiply by Q from geqlf + @defgroup ung2l {un,or}g2l: step in ungql + @defgroup unm2l {un,or}m2l: step in unmql + @} + + 
@defgroup gerq_comp_grp RQ + @{ + @defgroup gerqf gerqf: RQ factor + @defgroup gerq2 gerq2: RQ factor, level 2 + @defgroup ungrq {un,or}grq: generate explicit Q from gerqf + @defgroup unmrq {un,or}mrq: multiply by Q from gerqf + @defgroup unmr2 {un,or}mr2: step in unmrq + @defgroup ungr2 {un,or}gr2: step in ungrq + @} + + @defgroup ggrq_comp_grp Generalized RQ + @{ + @defgroup ggrqf ggrqf: Generalized RQ factor + @} + + @defgroup gerz_comp_grp RZ + @{ + @defgroup tzrzf tzrzf: RZ factor + @defgroup latrz latrz: RZ factor step + @defgroup unmrz {un,or}mrz: multiply by Z from tzrzf + @defgroup unmr3 {un,or}mr3: step in unmrz + @defgroup larz larz: apply reflector + @defgroup larzb larzb: apply block reflector + @defgroup larzt larzt: generate T matrix + @} + + @defgroup gecs_comp_grp Cosine-Sine (CS) decomposition + @{ + @defgroup bbcsd bbcsd: ?? + @defgroup uncsd {un,or}csd: ?? + @defgroup uncsd2by1 {un,or}csd2by1: ?? + @defgroup unbdb {un,or}bdb: bidiagonalize partitioned unitary matrix, step in uncsd + @defgroup unbdb1 {un,or}bdb1: step in uncsd2by1 + @defgroup unbdb2 {un,or}bdb2: step in uncsd2by1 + @defgroup unbdb3 {un,or}bdb3: step in uncsd2by1 + @defgroup unbdb4 {un,or}bdb4: step in uncsd2by1 + @defgroup unbdb5 {un,or}bdb5: step in uncsd2by1 + @defgroup unbdb6 {un,or}bdb6: step in uncsd2by1 + + @defgroup lapmr lapmr: permute rows + @defgroup lapmt lapmt: permute cols + @} + + @defgroup reflector_aux_grp Householder reflectors + @{ + @defgroup larf larf: apply Householder reflector + @defgroup larfx larfx: apply Householder reflector, unrolled + @defgroup larfy larfy: apply Householder reflector symmetrically (2-sided) + @defgroup larfb larfb: apply block Householder reflector + @defgroup larfg larfg: generate Householder reflector + @defgroup larfgp larfgp: generate Householder reflector, beta ≥ 0 + @defgroup larft larft: generate T matrix + @} + + @defgroup rot_aux_grp Givens/Jacobi plane rotations + @{ + @defgroup lartg lartg: generate plane rotation, more accurate than BLAS rot + @defgroup lartgp lartgp: generate plane rotation, more accurate than BLAS rot + @defgroup lasr lasr: apply series of plane rotations + @defgroup largv largv: generate vector of plane rotations + @defgroup lartv lartv: apply vector of plane rotations to vectors + @defgroup lar2v lar2v: apply vector of plane rotations to 2x2 matrices + @defgroup lacrt lacrt: apply plane rotation (unused?) 
+ @} + @} + + @defgroup geev_top Non-symmetric eigenvalues + @{ + @defgroup geev_driver_grp Standard eig driver, AV = VΛ + @{ + @defgroup geev geev: eig + @defgroup geevx geevx: eig, expert + + @defgroup gees gees: Schur form + @defgroup geesx geesx: Schur form, expert + @} + + @defgroup ggev_driver_grp Generalized eig driver + @{ + @defgroup ggev3 ggev3: eig + @defgroup ggev ggev: eig, unblocked + @defgroup ggevx ggevx: eig, expert + + @defgroup gges3 gges3: Schur form + @defgroup gges gges: Schur form, unblocked + @defgroup ggesx ggesx: Schur form, expert + @} + + @defgroup gedmd DMD driver, Dynamic Mode Decomposition + + @defgroup geev_comp_grp Eig computational routines + @{ + @defgroup gebal gebal: balance matrix + @defgroup gehrd gehrd: reduction to Hessenberg + @defgroup gehd2 gehd2: reduction to Hessenberg, level 2 + @defgroup lahr2 lahr2: step in gehrd + @defgroup unghr {un,or}ghr: generate Q from gehrd + @defgroup unmhr {un,or}mhr: multiply by Q from gehrd + @defgroup gebak gebak: back-transform eigvec + @defgroup hseqr hseqr: Hessenberg eig, QR iteration + @defgroup hsein hsein: Hessenberg inverse iteration for eigvec + @defgroup trevc trevc: eigenvectors of triangular Schur form, old + @defgroup trevc3 trevc3: eigenvectors of triangular Schur form, blocked + @defgroup laln2 laln2: 1x1 or 2x2 solve, step in trevc + + @defgroup trsyl trsyl: Sylvester equation + @defgroup trsyl3 trsyl3: Sylvester equation, level 3 + @defgroup lasy2 lasy2: Sylvester equation + + @defgroup trsna trsna: eig condition numbers + @defgroup laqtr laqtr: quasi-triangular solve + + @defgroup trexc trexc: reorder Schur form + @defgroup trsen trsen: reorder Schur form + @defgroup laexc laexc: reorder Schur form + + @defgroup lanv2 lanv2: 2x2 Schur factor + + @defgroup laqr_group --- hseqr auxiliary --- + @defgroup laein laein: eigvec by Hessenberg inverse iteration + @defgroup lahqr lahqr: eig of Hessenberg, step in hseqr + @defgroup laqr0 laqr0: eig of Hessenberg, step in hseqr + @defgroup laqr1 laqr1: step in hseqr + @defgroup laqr2 laqr2: step in hseqr + @defgroup laqr3 laqr3: step in hseqr + @defgroup laqr4 laqr4: eig of Hessenberg, step in hseqr + @defgroup laqr5 laqr5: step in hseqr + + @defgroup iparmq iparmq: set parameters for hseqr + + @defgroup laqz_group --- ggev3, gges3 auxiliary --- + @defgroup laqz0 laqz0: step in ggev3, gges3 + @defgroup laqz1 laqz1: step in ggev3, gges3 + @defgroup laqz2 laqz2: step in ggev3, gges3 + @defgroup laqz3 laqz3: step in ggev3, gges3 + @defgroup laqz4 laqz4: step in ggev3, gges3 + @} + + @defgroup ggev_comp_grp Generalized eig computational routines + @{ + @defgroup ggbal ggbal: balance matrix + @defgroup gghrd gghrd: reduction to Hessenberg + @defgroup gghd3 gghd3: reduction to Hessenberg, level 3 + @defgroup hgeqz hgeqz: generalized Hessenberg eig + @defgroup ggbak ggbak: back-transform eigvec + @defgroup tgsen tgsen: reorder generalized Schur form + @defgroup tgsna tgsna: reciprocal cond est + @defgroup tgsyl tgsyl: Sylvester equation + @defgroup tgsy2 tgsy2: Sylvester equation panel (?) 
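Each leaf @defgroup in this hierarchy only declares the bucket; a routine's documentation lands in it through an \ingroup reference in the routine's own source header. A minimal sketch in the fixed-form Fortran comment style used by the LAPACK sources; the \brief wording is illustrative, not quoted from LAPACK:

    *> \brief DGESV computes the solution to a real system of linear
    *>        equations A * X = B.
    *>
    *> \ingroup gesv

The \ingroup gesv line attaches DGESV to the "gesv: factor and solve" group defined near the top of this file, taking the place of the old per-precision tag (\ingroup doubleGEsolve) that matched the groups removed above.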
+ @defgroup unm22 {un,or}m22: multiply by banded Q, step in gghd3 + @defgroup lagv2 lagv2: 2x2 generalized Schur factor + @defgroup tgevc tgevc: eigvec of pair of matrices + @defgroup tgexc tgexc: reorder generalized Schur form + @defgroup tgex2 tgex2: reorder generalized Schur form + @} + @} + + @defgroup heev_top Hermitian/symmetric eigenvalues + @{ + @defgroup heev_driver_grp Standard eig driver, AV = VΛ + @{ + @defgroup heev_driver --- full --- + @defgroup heev {he,sy}ev: eig, QR iteration + @defgroup heevd {he,sy}evd: eig, divide and conquer + @defgroup heevr {he,sy}evr: eig, MRRR + @defgroup heevx {he,sy}evx: eig, bisection + + @defgroup heev_driver2 --- full, 2-stage --- + @defgroup heev_2stage {he,sy}ev_2stage: eig, QR iteration + @defgroup heevd_2stage {he,sy}evd_2stage: eig, divide and conquer + @defgroup heevr_2stage {he,sy}evr_2stage: eig, MRRR + @defgroup heevx_2stage {he,sy}evx_2stage: eig, bisection + + @defgroup hpev_driver --- packed --- + @defgroup hpev {hp,sp}ev: eig, QR iteration + @defgroup hpevd {hp,sp}evd: eig, divide and conquer + @defgroup hpevx {hp,sp}evx: eig, bisection + + @defgroup hbev_driver --- banded --- + @defgroup hbev {hb,sb}ev: eig, QR iteration + @defgroup hbevd {hb,sb}evd: eig, divide and conquer + @defgroup hbevx {hb,sb}evx: eig, bisection + + @defgroup hbev_driver2 --- banded, 2nd-stage --- + @defgroup hbev_2stage {hb,sb}ev_2stage: eig, QR iteration + @defgroup hbevd_2stage {hb,sb}evd_2stage: eig, divide and conquer + @defgroup hbevx_2stage {hb,sb}evx_2stage: eig, bisection + + @defgroup stev_driver --- tridiagonal --- + @defgroup stev stev: eig, QR iteration + @defgroup stevd stevd: eig, divide and conquer + @defgroup stevr stevr: eig, MRRR + @defgroup stevx stevx: eig, bisection + @defgroup pteqr pteqr: eig, positive definite tridiagonal + + @defgroup stebz stebz: eig, Kahan + @defgroup sterf sterf: eig, QR iteration + @defgroup stedc stedc: eig, divide and conquer + @defgroup stegr stegr: eig, bisection, see stemr + @defgroup stein stein: eig, inverse iteration + @defgroup stemr stemr: eig, relatively robust representation (RRR) + @defgroup steqr steqr: eig, QR iteration + @} + + @defgroup hegv_driver_grp Generalized eig driver, AV = BVΛ, etc. 
+ @{ + @defgroup hegv_driver --- full --- + @defgroup hegv {he,sy}gv: eig, QR iteration + @defgroup hegv_2stage {he,sy}gv_2stage: eig, QR iteration, 2-stage + @defgroup hegvd {he,sy}gvd: eig, divide and conquer + @defgroup hegvx {he,sy}gvx: eig, bisection + + @defgroup hpgv_driver --- packed --- + @defgroup hpgv {hp,sp}gv: eig, QR iteration + @defgroup hpgvd {hp,sp}gvd: eig, divide and conquer + @defgroup hpgvx {hp,sp}gvx: eig, bisection + + @defgroup hbgv_driver --- banded --- + @defgroup hbgv {hb,sb}gv: eig, QR iteration + @defgroup hbgvd {hb,sb}gvd: eig, divide and conquer + @defgroup hbgvx {hb,sb}gvx: eig, bisection + @} + + @defgroup heev_comp_grp Eig computational routines + @{ + @defgroup heev_comp --- full --- + @defgroup disna disna: eig condition numbers + @defgroup hetrd {he,sy}trd: reduction to tridiagonal + @defgroup hetd2 {he,sy}td2: reduction to tridiagonal, level 2 + @defgroup latrd latrd: step in hetrd + @defgroup ungtr {un,or}gtr: generate Q from hetrd + @defgroup unmtr {un,or}mtr: multiply by Q from hetrd + + @defgroup hetrd_2stage {he,sy}trd_2stage: reduction to tridiagonal, 2-stage + @defgroup hetrd_he2hb {he,sy}trd_he2hb: full to band (1st stage) + @defgroup hetrd_hb2st {he,sy}trd_hb2st: band to tridiagonal (2nd stage) + @defgroup hb2st_kernels {hb,sb}2st_kernels: band to tridiagonal (2nd stage) + + @defgroup lae2 lae2: 2x2 eig, step in steqr, stemr + @defgroup laesy laesy: 2x2 eig + @defgroup laev2 laev2: 2x2 eig + @defgroup lagtf lagtf: LU factor of (T - λI) + @defgroup lagts lagts: LU solve of (T - λI) x = y + + @defgroup hpev_comp --- packed --- + @defgroup hptrd {hp,sp}trd: reduction to tridiagonal + @defgroup upgtr {up,op}gtr: generate Q from hetrd + @defgroup upmtr {up,op}mtr: multiply by Q from hptrd + + @defgroup hbev_comp --- banded --- + @defgroup hbtrd {hb,sb}trd: reduction to tridiagonal + @} + + @defgroup hegv_comp_grp Generalized eig computational routines + @{ + @defgroup hegst {he,sy}gst: reduction to standard form + @defgroup hegs2 {he,sy}gs2: reduction to standard form, level 2 + @defgroup hpgst {hp,sp}gst: reduction to standard form, packed + @defgroup hbgst {hb,sb}gst: reduction to standard form, banded + @defgroup pbstf pbstf: split Cholesky factor, use with hbgst + @defgroup lag2 lag2: 2x2 eig + @} + + @defgroup stev_comp_grp tridiag bisection routines + @{ + @defgroup laebz laebz: counts eigvals <= value + @defgroup laneg laneg: Sturm count + @} + + @defgroup laed_comp_grp tridiag divide and conquer (D&C) routines + @{ + @defgroup laed0 laed0: D&C step: top level solver + @defgroup laed1 laed1: D&C step: merge subproblems + @defgroup laed2 laed2: D&C step: deflation + @defgroup laed3 laed3: D&C step: secular equation + @defgroup laed4 laed4: D&C step: secular equation nonlinear solver + @defgroup laed5 laed5: D&C step: secular equation, 2x2 + @defgroup laed6 laed6: D&C step: secular equation Newton step + @defgroup lamrg lamrg: permutation to merge 2 sorted lists + + @defgroup laed_comp2 --- eig value only or update Q --- + @defgroup laed7 laed7: D&C step: merge subproblems + @defgroup laed8 laed8: D&C step: deflation + @defgroup laed9 laed9: D&C step: secular equation + @defgroup laeda laeda: D&C step: z vector + @} + + @defgroup larr_comp_grp tridiag RRR routines + @{ + @defgroup larra larra: step in stemr + @defgroup larrb larrb: step in stemr + @defgroup larrc larrc: step in stemr + @defgroup larrd larrd: step in stemr, tridiag eig + @defgroup larre larre: step in stemr + @defgroup larrf larrf: step in stemr, find relative robust 
representation (RRR) + @defgroup larrj larrj: step in stemr, refine eigval estimates + @defgroup larrk larrk: step in stemr, compute one eigval + @defgroup larrr larrr: step in stemr, test to do expensive tridiag eig algorithm + @defgroup larrv larrv: eig tridiagonal, step in stemr & stegr + @defgroup lar1v lar1v: step in larrv, hence stemr & stegr + @} + @} + + @defgroup svd_top Singular Value Decomposition (SVD) + @{ + @defgroup svd_driver_grp Standard SVD driver, A = UΣV^H + @{ + @defgroup gesvd_driver --- full --- + @defgroup gesvd gesvd: SVD, QR iteration + @defgroup gesvdq gesvdq: SVD, QR with pivoting + @defgroup gesdd gesdd: SVD, divide and conquer + @defgroup gesvdx gesvdx: SVD, bisection + @defgroup gejsv gejsv: SVD, Jacobi, high-level + @defgroup gesvj gesvj: SVD, Jacobi, low-level + + @defgroup bdsvd_driver --- bidiagonal --- + @defgroup bdsqr bdsqr: bidiagonal SVD, QR iteration (dqds) + @defgroup bdsdc bdsdc: bidiagonal SVD, divide and conquer + @defgroup bdsvdx bdsvdx: bidiagonal SVD, bisection + @} + + @defgroup ggsvd_driver_grp Generalized SVD driver + @{ + @defgroup ggsvd3 ggsvd3: SVD, QR iteration + @} + + @defgroup gesvd_comp_grp SVD computational routines + @{ + @defgroup gebrd gebrd: reduction to bidiagonal + @defgroup gebd2 gebd2: reduction to bidiagonal, level 2 + @defgroup labrd labrd: step in gebrd + @defgroup gbbrd gbbrd: band to bidiagonal + @defgroup ungbr {un,or}gbr: generate Q, P from gebrd + @defgroup unmbr {un,or}mbr: multiply by Q, P from gebrd + + @defgroup gesvd_aux --- auxiliary routines --- + @defgroup gsvj0 gsvj0: step in gesvj + @defgroup gsvj1 gsvj1: step in gesvj + @defgroup las2 las2: 2x2 triangular SVD + @defgroup lasv2 lasv2: 2x2 triangular SVD + @defgroup lartgs lartgs: generate plane rotation for bidiag SVD + @} + + @defgroup ggsvd_comp_grp Generalized SVD computational routines + @{ + @defgroup ggsvp3 ggsvp3: step in ggsvd + @defgroup tgsja tgsja: generalized SVD of trapezoidal matrices, step in ggsvd3 + @defgroup lags2 lags2: 2x2 orthogonal factor, step in tgsja + @defgroup lapll lapll: linear dependence of 2 vectors + @} + + @defgroup lasq_comp_grp bidiag QR iteration routines + @{ + @defgroup lasq1 lasq1: dqds step + @defgroup lasq2 lasq2: dqds step + @defgroup lasq3 lasq3: dqds step + @defgroup lasq4 lasq4: dqds step + @defgroup lasq5 lasq5: dqds step + @defgroup lasq6 lasq6: dqds step + @} + + @defgroup lasd_comp_grp bidiag D&C routines + @{ + @defgroup lasd0 lasd0: D&C step: top level solver + @defgroup lasdt lasdt: D&C step: tree + @defgroup lasd1 lasd1: D&C step: merge subproblems + @defgroup lasd2 lasd2: D&C step: deflation + @defgroup lasd3 lasd3: D&C step: secular equation + @defgroup lasd4 lasd4: D&C step: secular equation nonlinear solver + @defgroup lasd5 lasd5: D&C step: secular equation, 2x2 + @defgroup lasdq lasdq: D&C step: leaf using bdsqr + + @defgroup lasd_comp2 --- singular values only or factored form --- + @defgroup lasda lasda: D&C step: top level solver + @defgroup lasd6 lasd6: D&C step: merge subproblems + @defgroup lasd7 lasd7: D&C step: deflation + @defgroup lasd8 lasd8: D&C step: secular equation + @} + @} + + @defgroup blas_like_top BLAS-like + @{ + @defgroup set_grp Initialize, copy, convert + @{ + @defgroup laset laset: set matrix + @defgroup larnv larnv: random vector + @defgroup laruv laruv: random uniform vector + + @defgroup lacpy lacpy: copy matrix + + @defgroup lacp2 lacp2: general matrix, convert real to complex + @defgroup _lag2_ _lag2_: general matrix, convert double <=> single + @defgroup _lat2_ 
_lat2_: triangular matrix, convert double <=> single + + @defgroup tfttp tfttp: triangular matrix, RFP (tf) to packed (tp) + @defgroup tfttr tfttr: triangular matrix, RFP (tf) to full (tr) + @defgroup tpttf tpttf: triangular matrix, packed (tp) to RFP (tf) + @defgroup tpttr tpttr: triangular matrix, packed (tp) to full (tr) + @defgroup trttf trttf: triangular matrix, full (tr) to RFP (tf) + @defgroup trttp trttp: triangular matrix, full (tr) to packed (tp) + @} + + @defgroup norm_grp Matrix norm + @{ + @defgroup lange lange: general matrix + @defgroup langb langb: general matrix, banded + @defgroup langt langt: general matrix, tridiagonal + + @defgroup lanhs lanhs: Hessenberg + + @defgroup lanhe lan{he,sy}: Hermitian/symmetric matrix + @defgroup lanhf lan{hf,sf}: Hermitian/symmetric matrix, RFP + @defgroup lanhp lan{hp,sp}: Hermitian/symmetric matrix, packed + @defgroup lanhb lan{hb,sb}: Hermitian/symmetric matrix, banded + @defgroup lanht lan{ht,st}: Hermitian/symmetric matrix, tridiagonal + + @defgroup lantr lantr: triangular matrix + @defgroup lantp lantp: triangular matrix, packed + @defgroup lantb lantb: triangular matrix, banded + @} + + @defgroup blas0_like_grp Scalar operations + @{ + @defgroup isnan isnan: test for NaN + @defgroup laisnan laisnan: test for NaN, unoptimized + @defgroup ladiv ladiv: complex divide + @defgroup lapy2 lapy2: robust sqrt( x^2 + y^2 ) + @defgroup lapy3 lapy3: robust sqrt( x^2 + y^2 + z^2 ) + @defgroup larmm larmm: scale factor to avoid overflow, step in latrs + @} + + @defgroup blas1_like_grp Level 1 BLAS-like vector ops + @{ + @defgroup lacgv lacgv: conjugate vector + @defgroup lasrt lasrt: sort vector + @defgroup lassq lassq: sum-of-squares, avoiding over/underflow + @defgroup rscl rscl: scale vector by reciprocal + @} + + @defgroup blas2_like_grp Level 2 BLAS-like matrix-vector ops + @{ + @defgroup ilalc ilalc: find non-zero col + @defgroup ilalr ilalr: find non-zero row + @defgroup lascl lascl: scale matrix + @defgroup la_geamv la_geamv: matrix-vector multiply |A| * |x|, general + @defgroup la_gbamv la_gbamv: matrix-vector multiply |A| * |x|, general banded + @defgroup la_heamv la_heamv: matrix-vector multiply |A| * |x|, Hermitian/symmetric + @defgroup lascl2 lascl2: diagonal scale matrix, A = D A + @defgroup larscl2 larscl2: reciprocal diagonal scale matrix, A = D^{-1} A + @defgroup la_wwaddw la_wwaddw: add to double-double or single-single vector + @} + + @defgroup blas3_like_grp Level 3 BLAS-like matrix-matrix ops + @{ + @defgroup lagtm lagtm: tridiagonal matrix-matrix multiply + @defgroup lacrm lacrm: complex * real matrix-matrix multiply + @defgroup larcm larcm: real * complex matrix-matrix multiply + @defgroup hfrk hfrk: Hermitian rank-k update, RFP format + @defgroup tfsm tfsm: triangular-matrix solve, RFP format + @} + @} + + @defgroup aux_top Auxiliary routines + @{ + @defgroup aux_grp Other auxiliary routines + @{ + @defgroup lsame lsame: string comparison + @defgroup lsamen lsamen: string comparison + @defgroup roundup_lwork roundup_lwork: fix rounding integer to float + @defgroup second second: wall clock timer + @} + + @defgroup params_grp Parameters + @{ + @defgroup lamch lamch: machine parameters + @defgroup lamc1 lamc1: ?? + @defgroup lamc2 lamc2: ?? + @defgroup lamc3 lamc3: ?? + @defgroup lamc4 lamc4: ?? + @defgroup lamc5 lamc5: ?? 
+ @defgroup labad labad: over/underflow on obsolete pre-IEEE machines + @defgroup ilaver ilaver: LAPACK version + @defgroup ilaenv ilaenv: tuning parameters + @defgroup ilaenv2stage ilaenv2stage: tuning parameters for 2-stage eig + @defgroup iparam2stage iparam2stage: sets parameters for 2-stage eig + @defgroup ieeeck ieeeck: verify inf and NaN are safe + @defgroup la_constants la_constants: Fortran 95 module of constants + + @defgroup blast_aux --- BLAST constants --- + @defgroup iladiag iladiag: diag string to BLAST const + @defgroup ilaprec ilaprec: precision string to BLAST const + @defgroup ilatrans ilatrans: trans string to BLAST const + @defgroup ilauplo ilauplo: uplo string to BLAST const + @defgroup la_transtype la_transtype: BLAST const to string + @} + + @defgroup xerbla_grp Error reporting + @{ + @defgroup xerbla xerbla: error reporting + @defgroup xerbla_array xerbla_array: error reporting, callable from C + @} + @} +@} + +@defgroup blas_top BLAS +BLAS are defined by three papers: +Basic linear algebra subprograms for {FORTRAN} usage, Lawson et al, 1979. +An extended set of {FORTRAN} basic linear algebra subprograms, Dongarra et al, 1988. +A set of level 3 basic linear algebra subprograms, Dongarra et al, 1990. +Some BLAS-like routines (e.g., csymv, crot, csum1, icmax1) exist in +LAPACK rather than the classic BLAS. +These were extended by the Extra Precision BLAS (XBLAS, not documented here) +https://www.netlib.org/xblas/ + +@{ + @defgroup scalar_grp Scalar operations + @{ + @defgroup abs1 abs1: | real( x ) | + | imag( x ) | + @} + + @defgroup blas1_grp Level 1 BLAS: vector ops + @{ + @defgroup asum asum: sum | real( x_i ) | + | imag( x_i ) | + @defgroup sum1 sum1: sum | x_i | (in LAPACK) + @defgroup axpy axpy: y = ax + y + @defgroup copy copy: y = x + @defgroup dot dot: x^H x and x^T x + @defgroup iamax iamax: argmax_i | real( x_i ) | + | imag( x_i ) | + @defgroup imax1 imax1: argmax_i | x_i | (in LAPACK) + @defgroup nrm2 nrm2: || x ||_2 + @defgroup scal scal: x = alpha x + @defgroup swap swap: x <=> y + + @defgroup rot_comp --- Givens/Jacobi plane rotations --- + @defgroup rot rot: apply plane rotation ([cz]rot in LAPACK) + @defgroup rotg rotg: generate plane rotation (cf. 
lartg) + @defgroup rotm rotm: apply modified (fast) plane rotation + @defgroup rotmg rotmg: generate modified (fast) plane rotation + @} + + @defgroup blas2_grp Level 2 BLAS: matrix-vector ops + @{ + @defgroup blas2_full --- full --- + @defgroup gemv gemv: general matrix-vector multiply + @defgroup ger ger: general matrix rank-1 update + + @defgroup hemv {he,sy}mv: Hermitian/symmetric matrix-vector multiply ([cz]symv in LAPACK) + @defgroup her {he,sy}r: Hermitian/symmetric rank-1 update + @defgroup her2 {he,sy}r2: Hermitian/symmetric rank-2 update + + @defgroup trmv trmv: triangular matrix-vector multiply + @defgroup trsv trsv: triangular matrix-vector solve + + @defgroup blas2_packed --- packed --- + @defgroup hpmv {hp,sp}mv: Hermitian/symmetric matrix-vector multiply + @defgroup hpr {hp,sp}r: Hermitian/symmetric rank-1 update + @defgroup hpr2 {hp,sp}r2: Hermitian/symmetric rank-2 update + + @defgroup tpmv tpmv: triangular matrix-vector multiply + @defgroup tpsv tpsv: triangular matrix-vector solve + + @defgroup blas2_banded --- banded --- + @defgroup gbmv gbmv: general matrix-vector multiply + + @defgroup hbmv {hb,sb}mv: Hermitian/symmetric matrix-vector multiply + + @defgroup tbmv tbmv: triangular matrix-vector multiply + @defgroup tbsv tbsv: triangular matrix-vector solve + @} + + @defgroup blas3_grp Level 3 BLAS: matrix-matrix ops + @{ + @defgroup gemm gemm: general matrix-matrix multiply + + @defgroup hemm {he,sy}mm: Hermitian/symmetric matrix-matrix multiply + @defgroup herk {he,sy}rk: Hermitian/symmetric rank-k update + @defgroup her2k {he,sy}r2k: Hermitian/symmetric rank-2k update + + @defgroup trmm trmm: triangular matrix-matrix multiply + @defgroup trsm trsm: triangular matrix-matrix solve + @} +@} + **/ diff --git a/lapack-netlib/INSTALL/ilaver.c b/lapack-netlib/INSTALL/ilaver.c index 184e9b78d..d64c841a2 100644 --- a/lapack-netlib/INSTALL/ilaver.c +++ b/lapack-netlib/INSTALL/ilaver.c @@ -315,7 +315,6 @@ typedef struct Namelist Namelist; /* > \author Univ. of Colorado Denver */ /* > \author NAG Ltd. */ -/* > \date November 2019 */ /* > \ingroup auxOTHERauxiliary */ @@ -332,7 +331,7 @@ typedef struct Namelist Namelist; /* ===================================================================== */ *vers_major__ = 3; - *vers_minor__ = 11; + *vers_minor__ = 12; *vers_patch__ = 0; /* ===================================================================== */ diff --git a/lapack-netlib/INSTALL/ilaver.f b/lapack-netlib/INSTALL/ilaver.f index a246c37cb..1827d5cd2 100644 --- a/lapack-netlib/INSTALL/ilaver.f +++ b/lapack-netlib/INSTALL/ilaver.f @@ -44,7 +44,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
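The ILAVER bump above is easy to verify at runtime. A minimal sketch, assuming the usual trailing-underscore Fortran name mangling (toolchain-dependent) and linking against the built library:

```c
/* Smoke test: query the vendored LAPACK version through ILAVER,
 * whose constants are bumped to 3.12.0 in the hunks above.
 * The trailing underscore in the symbol name is a mangling assumption. */
#include <stdio.h>

extern void ilaver_( int *vers_major, int *vers_minor, int *vers_patch );

int main(void) {
    int major, minor, patch;
    ilaver_( &major, &minor, &patch );
    printf( "LAPACK %d.%d.%d\n", major, minor, patch );  /* expect 3.12.0 */
    return 0;
}
```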
* -*> \date November 2019 * *> \ingroup auxOTHERauxiliary * @@ -60,7 +59,7 @@ INTEGER VERS_MAJOR, VERS_MINOR, VERS_PATCH * ===================================================================== VERS_MAJOR = 3 - VERS_MINOR = 11 + VERS_MINOR = 12 VERS_PATCH = 0 * ===================================================================== * diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h index c64fc4416..798a5eb2e 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_config.h +++ b/lapack-netlib/LAPACKE/include/lapacke_config.h @@ -68,6 +68,17 @@ extern "C" { #endif #ifndef LAPACK_COMPLEX_CUSTOM +#if defined(_MSC_VER) + #define _CRT_USE_C_COMPLEX_H + #include <complex.h> + #define LAPACK_COMPLEX_CUSTOM + #define lapack_complex_float _Fcomplex + #define lapack_complex_double _Dcomplex + #define lapack_complex_float_real(z) (creal(z)) + #define lapack_complex_float_imag(z) (cimag(z)) + #define lapack_complex_double_real(z) (creal(z)) + #define lapack_complex_double_imag(z) (cimag(z)) +#else #if defined(LAPACK_COMPLEX_STRUCTURE) @@ -109,6 +120,7 @@ typedef struct { double real, imag; } _lapack_complex_double; #define lapack_complex_double_real(z) (creal(z)) #define lapack_complex_double_imag(z) (cimag(z)) +#endif #endif lapack_complex_float lapack_make_complex_float( float re, float im ); diff --git a/lapack-netlib/LICENSE b/lapack-netlib/LICENSE index 94cdb0f85..96b04c988 100644 --- a/lapack-netlib/LICENSE +++ b/lapack-netlib/LICENSE @@ -1,9 +1,9 @@ -Copyright (c) 1992-2017 The University of Tennessee and The University +Copyright (c) 1992-2023 The University of Tennessee and The University of Tennessee Research Foundation. All rights reserved. -Copyright (c) 2000-2017 The University of California Berkeley. All +Copyright (c) 2000-2023 The University of California Berkeley. All rights reserved. -Copyright (c) 2006-2017 The University of Colorado Denver. All rights +Copyright (c) 2006-2023 The University of Colorado Denver. All rights reserved. $COPYRIGHT$ diff --git a/lapack-netlib/README.md b/lapack-netlib/README.md index 142aa7b72..a00d4c51d 100644 --- a/lapack-netlib/README.md +++ b/lapack-netlib/README.md @@ -6,7 +6,7 @@ [![Appveyor](https://ci.appveyor.com/api/projects/status/bh38iin398msrbtr?svg=true)](https://ci.appveyor.com/project/langou/lapack/) [![codecov](https://codecov.io/gh/Reference-LAPACK/lapack/branch/master/graph/badge.svg)](https://codecov.io/gh/Reference-LAPACK/lapack) [![Packaging status](https://repology.org/badge/tiny-repos/lapack.svg)](https://repology.org/metapackage/lapack/versions) - +[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/Reference-LAPACK/lapack/badge)](https://securityscorecards.dev/viewer/?uri=github.com/Reference-LAPACK/lapack) * VERSION 1.0 : February 29, 1992 * VERSION 1.0a : June 30, 1992 @@ -37,6 +37,7 @@ * VERSION 3.10.0 : June 2021 * VERSION 3.10.1 : April 2022 * VERSION 3.11.0 : November 2022 +* VERSION 3.12.0 : November 2023 LAPACK is a library of Fortran subroutines for solving the most commonly occurring problems in numerical linear algebra.
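With the lapacke_config.h change above, MSVC builds map lapack_complex_float and lapack_complex_double onto the native _Fcomplex and _Dcomplex types rather than the C99 _Complex types. Client code stays portable across both configurations as long as it builds values through the lapack_make_complex_* constructors and reads them through the *_real/*_imag macros instead of using complex literals. A minimal sketch (LAPACKE_cgetrf is standard LAPACKE; error handling omitted):

```c
/* Portable complex usage across the MSVC (_Fcomplex) and
 * C99 (float _Complex) configurations of lapacke_config.h.
 * Assumes the library was built with LAPACKE enabled. */
#include <stdio.h>
#include <lapacke.h>

int main(void) {
    /* lapack_make_complex_float works for both definitions of
       lapack_complex_float, so no #ifdef _MSC_VER is needed here. */
    lapack_complex_float a[4] = {         /* 2x2 matrix, column-major */
        lapack_make_complex_float( 1.0f,  2.0f ),
        lapack_make_complex_float( 3.0f, -1.0f ),
        lapack_make_complex_float( 0.0f,  1.0f ),
        lapack_make_complex_float( 2.0f,  0.0f )
    };
    lapack_int ipiv[2];
    lapack_int info = LAPACKE_cgetrf( LAPACK_COL_MAJOR, 2, 2, a, 2, ipiv );
    printf( "cgetrf info = %d, U(1,1) = %g + %gi\n", (int)info,
            lapack_complex_float_real( a[0] ),
            lapack_complex_float_imag( a[0] ) );
    return 0;
}
```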
diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 448fbd8df..de2242701 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -136,7 +136,7 @@ SLASRC_O = \ sgebrd.o sgecon.o sgeequ.o sgees.o sgeesx.o sgeev.o sgeevx.o \ sgehd2.o sgehrd.o sgelq2.o sgelqf.o \ sgels.o sgelsd.o sgelss.o sgelsy.o sgeql2.o sgeqlf.o \ - sgeqp3.o sgeqr2.o sgeqr2p.o sgeqrf.o sgeqrfp.o sgerfs.o \ + sgeqp3.o sgeqp3rk.o sgeqr2.o sgeqr2p.o sgeqrf.o sgeqrfp.o sgerfs.o \ sgerq2.o sgerqf.o sgesc2.o sgesdd.o sgesv.o sgesvd.o sgesvdx.o sgesvx.o \ sgetc2.o sgetf2.o sgetri.o \ sggbak.o sggbal.o sgges.o sgges3.o sggesx.o \ @@ -151,7 +151,7 @@ SLASRC_O = \ slangb.o slange.o slangt.o slanhs.o slansb.o slansp.o \ slansy.o slantb.o slantp.o slantr.o slanv2.o \ slapll.o slapmt.o \ - slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \ + slaqgb.o slaqge.o slaqp2.o slaqp2rk.o slaqp3rk.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \ slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \ slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \ slarf.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \ @@ -232,7 +232,7 @@ CLASRC_O = \ cgbtf2.o cgbtrf.o cgbtrs.o cgebak.o cgebal.o cgebd2.o cgebrd.o \ cgecon.o cgeequ.o cgees.o cgeesx.o cgeev.o cgeevx.o \ cgehd2.o cgehrd.o cgelq2.o cgelqf.o \ - cgels.o cgelsd.o cgelss.o cgelsy.o cgeql2.o cgeqlf.o cgeqp3.o \ + cgels.o cgelsd.o cgelss.o cgelsy.o cgeql2.o cgeqlf.o cgeqp3.o cgeqp3rk.o \ cgeqr2.o cgeqr2p.o cgeqrf.o cgeqrfp.o cgerfs.o \ cgerq2.o cgerqf.o cgesc2.o cgesdd.o cgesv.o cgesvd.o cgesvdx.o \ cgesvj.o cgejsv.o cgsvj0.o cgsvj1.o \ @@ -266,7 +266,7 @@ CLASRC_O = \ clanhb.o clanhe.o \ clanhp.o clanhs.o clanht.o clansb.o clansp.o clansy.o clantb.o \ clantp.o clantr.o clapll.o clapmt.o clarcm.o claqgb.o claqge.o \ - claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqsb.o \ + claqhb.o claqhe.o claqhp.o claqp2.o claqp2rk.o claqp3rk.o claqps.o claqsb.o \ claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \ claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \ claqz0.o claqz1.o claqz2.o claqz3.o \ @@ -345,7 +345,7 @@ DLASRC_O = \ dgebrd.o dgecon.o dgeequ.o dgees.o dgeesx.o dgeev.o dgeevx.o \ dgehd2.o dgehrd.o dgelq2.o dgelqf.o \ dgels.o dgelsd.o dgelss.o dgelsy.o dgeql2.o dgeqlf.o \ - dgeqp3.o dgeqr2.o dgeqr2p.o dgeqrf.o dgeqrfp.o dgerfs.o \ + dgeqp3.o dgeqp3rk.o dgeqr2.o dgeqr2p.o dgeqrf.o dgeqrfp.o dgerfs.o \ dgerq2.o dgerqf.o dgesc2.o dgesdd.o dgesv.o dgesvd.o dgesvdx.o dgesvx.o \ dgetc2.o dgetf2.o dgetrf.o dgetri.o \ dgetrs.o dggbak.o dggbal.o dgges.o dgges3.o dggesx.o \ @@ -360,7 +360,7 @@ DLASRC_O = \ dlangb.o dlange.o dlangt.o dlanhs.o dlansb.o dlansp.o \ dlansy.o dlantb.o dlantp.o dlantr.o dlanv2.o \ dlapll.o dlapmt.o \ - dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \ + dlaqgb.o dlaqge.o dlaqp2.o dlaqp2rk.o dlaqp3rk.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \ dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \ dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \ @@ -437,7 +437,7 @@ ZLASRC_O = \ zgbtf2.o zgbtrf.o zgbtrs.o zgebak.o zgebal.o zgebd2.o zgebrd.o \ zgecon.o zgeequ.o zgees.o zgeesx.o zgeev.o zgeevx.o \ zgehd2.o zgehrd.o zgelq2.o zgelqf.o \ - zgels.o zgelsd.o zgelss.o zgelsy.o zgeql2.o zgeqlf.o zgeqp3.o \ + zgels.o zgelsd.o zgelss.o zgelsy.o zgeql2.o zgeqlf.o zgeqp3.o zgeqp3rk.o \ zgeqr2.o zgeqr2p.o zgeqrf.o zgeqrfp.o zgerfs.o zgerq2.o zgerqf.o \ zgesc2.o zgesdd.o zgesv.o zgesvd.o zgesvdx.o \ zgesvj.o zgejsv.o zgsvj0.o 
zgsvj1.o \ @@ -473,7 +473,7 @@ ZLASRC_O = \ zlanhe.o \ zlanhp.o zlanhs.o zlanht.o zlansb.o zlansp.o zlansy.o zlantb.o \ zlantp.o zlantr.o zlapll.o zlapmt.o zlaqgb.o zlaqge.o \ - zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqsb.o \ + zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqp2rk.o zlaqp3rk.o zlaqps.o zlaqsb.o \ zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \ zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \ zlaqz0.o zlaqz1.o zlaqz2.o zlaqz3.o \ diff --git a/lapack-netlib/SRC/cbbcsd.f b/lapack-netlib/SRC/cbbcsd.f index 1cd1ffbf1..4d0c45efe 100644 --- a/lapack-netlib/SRC/cbbcsd.f +++ b/lapack-netlib/SRC/cbbcsd.f @@ -805,7 +805,7 @@ CALL SLARTGP( B22BULGE, B22E(I-1), RWORK(IU2SN+I-1), $ RWORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL SLARTGS( B21E(I), B21E(I+1), NU, RWORK(IU2CS+I-1), + CALL SLARTGS( B21E(I), B21D(I+1), NU, RWORK(IU2CS+I-1), $ RWORK(IU2SN+I-1) ) ELSE CALL SLARTGS( B22D(I), B22E(I), MU, RWORK(IU2CS+I-1), diff --git a/lapack-netlib/SRC/cgebrd.f b/lapack-netlib/SRC/cgebrd.f index 5687161a5..5920b1cf5 100644 --- a/lapack-netlib/SRC/cgebrd.f +++ b/lapack-netlib/SRC/cgebrd.f @@ -123,7 +123,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -148,7 +149,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -225,8 +226,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. EXTERNAL CGEBD2, CGEMM, CLABRD, XERBLA @@ -236,16 +237,24 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB - WORK( 1 ) = REAL( LWKOPT ) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -253,7 +262,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -265,7 +274,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -284,7 +292,7 @@ * Determine when to switch from blocked to unblocked code. 
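The cgebrd.f hunks here tighten the LWORK validation (a separate LWKMIN next to LWKOPT) and route the returned size through SROUNDUP_LWORK, but the calling convention is unchanged: pass LWORK = -1 to receive the optimal size in WORK(1), then call again with a real buffer. A sketch of that two-call pattern from C, assuming the common trailing-underscore Fortran name mangling and 32-bit integers (both toolchain-dependent):

```c
/* Two-call LWORK = -1 workspace query against CGEBRD. */
#include <complex.h>
#include <stdio.h>
#include <stdlib.h>

extern void cgebrd_( const int *m, const int *n, float _Complex *a,
                     const int *lda, float *d, float *e,
                     float _Complex *tauq, float _Complex *taup,
                     float _Complex *work, const int *lwork, int *info );

int main(void) {
    int m = 100, n = 50, lda = m, lwork = -1, info = 0;
    float _Complex *a = calloc( (size_t)lda * n, sizeof *a );
    float *d = malloc( n * sizeof *d ), *e = malloc( n * sizeof *e );
    float _Complex *tauq = malloc( n * sizeof *tauq );
    float _Complex *taup = malloc( n * sizeof *taup );
    float _Complex wkopt;

    /* First call: LWORK = -1 only computes the optimal size and returns
       it (rounded safely to float by SROUNDUP_LWORK) in WORK(1). */
    cgebrd_( &m, &n, a, &lda, d, e, tauq, taup, &wkopt, &lwork, &info );
    lwork = (int)crealf( wkopt );

    /* Second call: perform the actual bidiagonal reduction. */
    float _Complex *work = malloc( (size_t)lwork * sizeof *work );
    cgebrd_( &m, &n, a, &lda, d, e, tauq, taup, work, &lwork, &info );
    printf( "cgebrd: info = %d, lwork used = %d\n", info, lwork );
    free( a ); free( d ); free( e ); free( tauq ); free( taup ); free( work );
    return 0;
}
```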
* IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using @@ -343,7 +351,7 @@ * CALL CGEBD2( M-I+1, N-I+1, A( I, I ), LDA, D( I ), E( I ), $ TAUQ( I ), TAUP( I ), WORK, IINFO ) - WORK( 1 ) = WS + WORK( 1 ) = SROUNDUP_LWORK( WS ) RETURN * * End of CGEBRD diff --git a/lapack-netlib/SRC/cgedmd.f90 b/lapack-netlib/SRC/cgedmd.f90 index 499489270..1413130ec 100644 --- a/lapack-netlib/SRC/cgedmd.f90 +++ b/lapack-netlib/SRC/cgedmd.f90 @@ -1,22 +1,526 @@ +!> \brief \b CGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== +! +! SUBROUTINE CGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, EIGS, Z, LDZ, RES, B, LDB, & +! W, LDW, S, LDS, ZWORK, LZWORK, & +! RWORK, LRWORK, IWORK, LIWORK, INFO ) +!..... +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real32 +! +!..... +! Scalar arguments +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LIWORK, LRWORK, LZWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments +! COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) +! COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) +! REAL(KIND=WP), INTENT(OUT) :: RES(*) +! REAL(KIND=WP), INTENT(OUT) :: RWORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) +! +!............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> CGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, CGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, CGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim +!...................................................................... +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. 
+!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim +!...................................................................... +!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim +!...................................................................... +! Arguments +! ========= +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim +!..... +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim +!..... +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. +!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim +!..... +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim +!..... +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library.
+!> 1 :: CGESVD (the QR SVD algorithm) +!> 2 :: CGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: CGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim +!..... +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim +!..... +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim +!..... +!> \param[in,out] X +!> \verbatim +!> X (input/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. +!> \endverbatim +!..... +!> \param[in] LDX +!> \verbatim +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim +!..... +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim +!..... +!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim +!..... +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1) +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim +!..... 
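To make the NRNK modes just described concrete, here is an illustrative helper (not code from CGEDMD; the name is hypothetical) that applies the documented truncation rules to a descending array of singular values, with TOL as described next:

```c
/* Illustrative only: how the documented NRNK/TOL modes determine the
 * numerical rank K from singular values sigma[0..n-1] in descending
 * order.  This mirrors the rules stated above for CGEDMD's NRNK. */
int dmd_rank( const float *sigma, int n, int nrnk, float tol ) {
    int k = 0;
    for ( int i = 0; i < n; i++ ) {
        if ( sigma[i] <= 0.0f ) break;  /* zero singular values never count */
        if ( nrnk == -1 && sigma[i] <= tol * sigma[0] ) break;
        if ( nrnk == -2 && i > 0 && sigma[i] <= tol * sigma[i-1] ) break;
        if ( nrnk > 0 && k >= nrnk ) break;  /* at most NRNK values used */
        k++;
    }
    return k;
}
```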
+!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim +!..... +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim +!..... +!> \param[out] EIGS +!> \verbatim +!> EIGS (output) COMPLEX(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of EIGS contain +!> the computed eigenvalues (Ritz values). +!> See the descriptions of K, and Z. +!> \endverbatim +!..... +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) COMPLEX(KIND=WP) M-by-N array +!> If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) +!> is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +!> If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +!> the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) +!> is an eigenvector corresponding to EIGS(i). The columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. +!> See the descriptions of EIGS, X and W. +!> \endverbatim +!..... +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim +!..... +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs, +!> RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. +!> See the description of EIGS and Z. +!> \endverbatim +!..... +!> \param[out] B +!> \verbatim +!> B (output) COMPLEX(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1:K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim +!..... +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim +!..... +!> \param[out] W +!> \verbatim +!> W (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient. +!> The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> right singular vectors of X. +!> \endverbatim +!..... +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim +!..... +!> \param[out] S +!> \verbatim +!> S (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by CGEEV. +!> See the description of K. +!> \endverbatim +!..... +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim +!..... 
+!> \param[out] ZWORK +!> \verbatim +!> ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array +!> ZWORK is used as complex workspace in the complex SVD, as +!> specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing +!> the eigenvalues of a Rayleigh quotient. +!> If the call to CGEDMD is only workspace query, then +!> ZWORK(1) contains the minimal complex workspace length and +!> ZWORK(2) is the optimal complex workspace length. +!> Hence, the length of work is at least 2. +!> See the description of LZWORK. +!> \endverbatim +!..... +!> \param[in] LZWORK +!> \verbatim +!> LZWORK (input) INTEGER +!> The minimal length of the workspace vector ZWORK. +!> LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV), +!> where LZWORK_CGEEV = MAX( 1, 2*N ) and the minimal +!> LZWORK_SVD is calculated as follows +!> If WHTSVD == 1 :: CGESVD :: +!> LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) +!> If WHTSVD == 2 :: CGESDD :: +!> LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) +!> If WHTSVD == 3 :: CGESVDQ :: +!> LZWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: CGEJSV :: +!> LZWORK_SVD = obtainable by a query +!> If on entry LZWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths and returns them in +!> LZWORK(1) and LZWORK(2), respectively. +!> \endverbatim +!..... +!> \param[out] RWORK +!> \verbatim +!> RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array +!> On exit, RWORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain +!> scaling factor RWORK(N+2)/RWORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. +!> If the call to CGEDMD is only workspace query, then +!> RWORK(1) contains the minimal workspace length. +!> See the description of LRWORK. +!> \endverbatim +!..... +!> \param[in] LRWORK +!> \verbatim +!> LRWORK (input) INTEGER +!> The minimal length of the workspace vector RWORK. +!> LRWORK is calculated as follows: +!> LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where +!> LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace +!> for the SVD subroutine determined by the input parameter +!> WHTSVD. +!> If WHTSVD == 1 :: CGESVD :: +!> LRWORK_SVD = 5*MIN(M,N) +!> If WHTSVD == 2 :: CGESDD :: +!> LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), +!> 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) +!> If WHTSVD == 3 :: CGESVDQ :: +!> LRWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: CGEJSV :: +!> LRWORK_SVD = obtainable by a query +!> If on entry LRWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> real workspace length and returns it in RWORK(1). +!> \endverbatim +!..... +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim +!..... +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. 
+!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for ZWORK, RWORK and +!> IWORK. See the descriptions of ZWORK, RWORK and IWORK. +!> \endverbatim +!..... +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! +!............................................................. +!............................................................. SUBROUTINE CGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & M, N, X, LDX, Y, LDY, NRNK, TOL, & K, EIGS, Z, LDZ, RES, B, LDB, & W, LDW, S, LDS, ZWORK, LZWORK, & RWORK, LRWORK, IWORK, LIWORK, INFO ) -! March 2023 +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! !..... USE iso_fortran_env IMPLICIT NONE INTEGER, PARAMETER :: WP = real32 -!..... +! ! Scalar arguments +! ~~~~~~~~~~~~~~~~ CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & NRNK, LDZ, LDB, LDW, LDS, & LIWORK, LRWORK, LZWORK INTEGER, INTENT(OUT) :: K, INFO REAL(KIND=WP), INTENT(IN) :: TOL +! ! Array arguments +! ~~~~~~~~~~~~~~~ COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & W(LDW,*), S(LDS,*) @@ -25,364 +529,14 @@ REAL(KIND=WP), INTENT(OUT) :: RES(*) REAL(KIND=WP), INTENT(OUT) :: RWORK(*) INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! CGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, CGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, CGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. 
Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. ! -!...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ -!...................................................................... -! Arguments -! ========= -! JOBS (input) CHARACTER*1 -! Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. -!..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 'N' :: The eigenvectors are not computed. -!..... -! JOBR (input) CHARACTER*1 -! Determines whether to compute the residuals. -! 'R' :: The residuals for the computed eigenpairs will be -! computed and stored in the array RES. -! See the description of RES. -! For this option to be legal, JOBZ must be 'V'. -! 'N' :: The residuals are not computed. -!..... -! JOBF (input) CHARACTER*1 -! Specifies whether to store information needed for post- -! processing (e.g. computing refined Ritz vectors) -! 'R' :: The matrix needed for the refinement of the Ritz -! vectors is computed and stored in the array B. -! See the description of B. -! 'E' :: The unscaled eigenvectors of the Exact DMD are -! 
computed and returned in the array B. See the -! description of B. -! 'N' :: No eigenvector refinement data is computed. -!..... -! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } -! Allows for a selection of the SVD algorithm from the -! LAPACK library. -! 1 :: CGESVD (the QR SVD algorithm) -! 2 :: CGESDD (the Divide and Conquer algorithm; if enough -! workspace available, this is the fastest option) -! 3 :: CGESVDQ (the preconditioned QR SVD ; this and 4 -! are the most accurate options) -! 4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 -! are the most accurate options) -! For the four methods above, a significant difference in -! the accuracy of small singular values is possible if -! the snapshots vary in norm so that X is severely -! ill-conditioned. If small (smaller than EPS*||X||) -! singular values are of interest and JOBS=='N', then -! the options (3, 4) give the most accurate results, where -! the option 4 is slightly better and with stronger -! theoretical background. -! If JOBS=='S', i.e. the columns of X will be normalized, -! then all methods give nearly equally accurate results. -!..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). -!..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). -!..... -! X (input/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. -!..... -! LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. -!..... -! Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. -!..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. -!..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1) -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. -!..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. -!..... -! K (output) INTEGER, 0 <= K <= N -! The dimension of the POD basis for the data snapshot -! 
matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. -!..... -! EIGS (output) COMPLEX(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of EIGS contain -! the computed eigenvalues (Ritz values). -! See the descriptions of K, and Z. -!..... -! Z (workspace/output) COMPLEX(KIND=WP) M-by-N array -! If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) -! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. -! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as -! the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) -! is an eigenvector corresponding to EIGS(i). The columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. -! See the descriptions of EIGS, X and W. -!..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. -!..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs, -! RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. -! See the description of EIGS and Z. -!..... -! B (output) COMPLEX(KIND=WP) M-by-N array. -! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1:K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. -!..... -! LDB (input) INTEGER, LDB >= M -! The leading dimension of the array B. -!..... -! W (workspace/output) COMPLEX(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient. -! The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! W is also used as a workspace to temporarily store the -! right singular vectors of X. -!..... -! LDW (input) INTEGER, LDW >= N -! The leading dimension of the array W. -!..... -! S (workspace/output) COMPLEX(KIND=WP) N-by-N array -! The array S(1:K,1:K) is used for the matrix Rayleigh -! quotient. This content is overwritten during -! the eigenvalue decomposition by CGEEV. -! See the description of K. -!..... -! LDS (input) INTEGER, LDS >= N -! The leading dimension of the array S. -!..... -! ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array -! ZWORK is used as complex workspace in the complex SVD, as -! specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing -! the eigenvalues of a Rayleigh quotient. -! If the call to CGEDMD is only workspace query, then -! ZWORK(1) contains the minimal complex workspace length and -! ZWORK(2) is the optimal complex workspace length. -! Hence, the length of work is at least 2. -! See the description of LZWORK. -!..... -! LZWORK (input) INTEGER -! The minimal length of the workspace vector ZWORK. -! LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV), -! where LZWORK_CGEEV = MAX( 1, 2*N ) and the minimal -! LZWORK_SVD is calculated as follows -! If WHTSVD == 1 :: CGESVD :: -! LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) -! If WHTSVD == 2 :: CGESDD :: -! LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) -! If WHTSVD == 3 :: CGESVDQ :: -! LZWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: CGEJSV :: -! LZWORK_SVD = obtainable by a query -! 
If on entry LZWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths and returns them in -! LZWORK(1) and LZWORK(2), respectively. -!..... -! RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array -! On exit, RWORK(1:N) contains the singular values of -! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). -! If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain -! scaling factor RWORK(N+2)/RWORK(N+1) used to scale X -! and Y to avoid overflow in the SVD of X. -! This may be of interest if the scaling option is off -! and as many as possible smallest eigenvalues are -! desired to the highest feasible accuracy. -! If the call to CGEDMD is only workspace query, then -! RWORK(1) contains the minimal workspace length. -! See the description of LRWORK. -!..... -! LRWORK (input) INTEGER -! The minimal length of the workspace vector RWORK. -! LRWORK is calculated as follows: -! LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where -! LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace -! for the SVD subroutine determined by the input parameter -! WHTSVD. -! If WHTSVD == 1 :: CGESVD :: -! LRWORK_SVD = 5*MIN(M,N) -! If WHTSVD == 2 :: CGESDD :: -! LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), -! 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) -! If WHTSVD == 3 :: CGESVDQ :: -! LRWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: CGEJSV :: -! LRWORK_SVD = obtainable by a query -! If on entry LRWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! real workspace length and returns it in RWORK(1). -!..... -! IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. -!..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for ZWORK, RWORK and -! IWORK. See the descriptions of ZWORK, RWORK and IWORK. -!..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. -!............................................................. -!............................................................. ! Parameters ! ~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) - +! ! Local scalars ! 
~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -400,7 +554,7 @@ ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: RDUMMY(2) - +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) CLANGE, SLAMCH, SCNRM2 @@ -408,13 +562,13 @@ INTEGER ICAMAX LOGICAL SISNAN, LSAME EXTERNAL SISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~~~~ EXTERNAL CAXPY, CGEMM, CSSCAL EXTERNAL CGEEV, CGEJSV, CGESDD, CGESVD, CGESVDQ, & CLACPY, CLASCL, CLASSQ, XERBLA - +! ! Intrinsic functions ! ~~~~~~~~~~~~~~~~~~~ INTRINSIC FLOAT, INT, MAX, SQRT @@ -607,7 +761,8 @@ K = 0 DO i = 1, N !WORK(i) = SCNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL CLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 @@ -680,7 +835,8 @@ ! carefully computed using CLASSQ. DO i = 1, N !RWORK(i) = SCNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL CLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 diff --git a/lapack-netlib/SRC/cgehrd.f b/lapack-netlib/SRC/cgehrd.f index f407f931a..7ba87cc01 100644 --- a/lapack-netlib/SRC/cgehrd.f +++ b/lapack-netlib/SRC/cgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -222,13 +222,19 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'CGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'CGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -249,7 +255,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -269,7 +274,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of * @@ -345,7 +350,8 @@ * Use unblocked code to reduce the rest of the matrix * CALL CGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cgelq.f b/lapack-netlib/SRC/cgelq.f index ff482bc42..24aaa982e 100644 --- a/lapack-netlib/SRC/cgelq.f +++ b/lapack-netlib/SRC/cgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed.
The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -295,9 +295,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -322,7 +322,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) * RETURN * diff --git a/lapack-netlib/SRC/cgelqf.f b/lapack-netlib/SRC/cgelqf.f index 75f5bc960..3847a958a 100644 --- a/lapack-netlib/SRC/cgelqf.f +++ b/lapack-netlib/SRC/cgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -175,9 +176,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'CGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -185,19 +185,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -267,7 +273,7 @@ $ CALL CGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGELQF diff --git a/lapack-netlib/SRC/cgemlq.f b/lapack-netlib/SRC/cgemlq.f index e0cf78bc0..e5b02b669 100644 --- a/lapack-netlib/SRC/cgemlq.f +++ b/lapack-netlib/SRC/cgemlq.f @@ -110,16 +110,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -143,7 +144,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -159,11 +160,13 @@ *> block sizes MB and NB returned by ILAENV, CGELQ will use either *> CLASWLQ (if the matrix is wide-and-short) or CGELQT to compute *> the LQ factorization. 
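The T/WORK bookkeeping these cgelq.f and cgemlq.f hunks adjust follows the query convention documented above: a call with TSIZE = -1 and LWORK = -1 returns the required sizes in T(1) and WORK(1). A sketch from C, assuming trailing-underscore Fortran mangling, 32-bit integers, and the CGELQ interface as declared in Reference-LAPACK's cgelq.f (the helper name is made up):

```c
/* One query call sizes both the T and WORK arrays for CGELQ, then the
 * factorization runs with real buffers.  T is kept by the caller because
 * CGEMLQ needs it later to apply Q. */
#include <complex.h>
#include <stdlib.h>

extern void cgelq_( const int *m, const int *n, float _Complex *a,
                    const int *lda, float _Complex *t, const int *tsize,
                    float _Complex *work, const int *lwork, int *info );

int lq_factor( int m, int n, float _Complex *a, int lda,
               float _Complex **t_out, int *tsize_out ) {
    int info, tsize = -1, lwork = -1;      /* -1 requests the optimal sizes */
    float _Complex tq, wq;

    cgelq_( &m, &n, a, &lda, &tq, &tsize, &wq, &lwork, &info );
    if ( info != 0 ) return info;
    tsize = (int)crealf( tq );             /* size came back in T(1)    */
    lwork = (int)crealf( wq );             /* size came back in WORK(1) */

    float _Complex *t    = malloc( (size_t)tsize * sizeof *t );
    float _Complex *work = malloc( (size_t)lwork * sizeof *work );
    cgelq_( &m, &n, a, &lda, t, &tsize, work, &lwork, &info );
    free( work );
    *t_out = t;
    *tsize_out = tsize;
    return info;
}
```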
-*> This version of CGEMLQ will use either CLAMSWLQ or CGEMLQT to +*> This version of CGEMLQ will use either CLAMSWLQ or CGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in CLAMSWLQ or CGEMLQT. *> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE CGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -185,11 +188,12 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLAMSWLQ, CGEMLQT, XERBLA @@ -201,7 +205,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -216,6 +220,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -244,12 +255,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( ( LWORK.LT.LWMIN ) .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = REAL( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -261,7 +272,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -274,7 +285,7 @@ $ MB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = REAL( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/cgemqr.f b/lapack-netlib/SRC/cgemqr.f index ea9de146e..0b7dd9dd7 100644 --- a/lapack-netlib/SRC/cgemqr.f +++ b/lapack-netlib/SRC/cgemqr.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -166,6 +167,8 @@ *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE CGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -187,11 +190,12 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. 
LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGEMQRT, CLAMTSQR, XERBLA @@ -203,7 +207,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -218,6 +222,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -251,7 +262,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -263,7 +274,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -276,7 +287,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/cgeqlf.f b/lapack-netlib/SRC/cgeqlf.f index 918bbddad..6c67344c5 100644 --- a/lapack-netlib/SRC/cgeqlf.f +++ b/lapack-netlib/SRC/cgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -187,10 +188,11 @@ NB = ILAENV( 1, 'CGEQLF', ' ', M, N, -1, -1 ) LWKOPT = N*NB END IF - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * @@ -277,7 +279,7 @@ IF( MU.GT.0 .AND. 
NU.GT.0 ) $ CALL CGEQL2( MU, NU, A, LDA, TAU, WORK, IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGEQLF diff --git a/lapack-netlib/SRC/cgeqp3rk.c b/lapack-netlib/SRC/cgeqp3rk.c new file mode 100644 index 000000000..54e7fb140 --- /dev/null +++ b/lapack-netlib/SRC/cgeqp3rk.c @@ -0,0 +1,1071 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist 
*/ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + +/* Check if the matrix A contains +Inf or -Inf, set INFO parameter */ +/* to the column number, where the first +/-Inf is found plus N, */ +/* and continue the computation. */ + + *info = *n + kp1; + + } + +/* ================================================================== */ + +/* Quick return if possible for the case when the first */ +/* stopping criterion is satisfied, i.e. KMAX = 0. */ + + if (*kmax == 0) { + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.f; + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + q__1.r = (real) lwkopt, q__1.i = 0.f; + work[1].r = q__1.r, work[1].i = q__1.i; + return 0; + } + +/* ================================================================== */ + + eps = slamch_("Epsilon"); + +/* Adjust ABSTOL */ + + if (*abstol >= 0.f) { + safmin = slamch_("Safe minimum"); +/* Computing MAX */ + r__1 = *abstol, r__2 = safmin * 2.f; + *abstol = f2cmax(r__1,r__2); + } + +/* Adjust RELTOL */ + + if (*reltol >= 0.f) { + *reltol = f2cmax(*reltol,eps); + } + +/* =================================================================== */ + +/* JMAX is the maximum index of the column to be factorized, */ +/* which is also limited by the first stopping criterion KMAX. */ + + jmax = f2cmin(*kmax,minmn); + +/* =================================================================== */ + +/* Quick return if possible for the case when the second or third */ +/* stopping criterion for the whole original matrix is satified, */ +/* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */ +/* (which is ONE <= RELTOL). */ + + if (maxc2nrm <= *abstol || 1.f <= *reltol) { + + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.f; + + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + + q__1.r = (real) lwkopt, q__1.i = 0.f; + work[1].r = q__1.r, work[1].i = q__1.i; + return 0; + } + +/* ================================================================== */ +/* Factorize columns */ +/* ================================================================== */ + +/* Determine the block size. */ + + nbmin = 2; + nx = 0; + + if (nb > 1 && nb < minmn) { + +/* Determine when to cross over from blocked to unblocked code. */ +/* (for N less than NX, unblocked code should be used). 
*/ + +/* Computing MAX */ + i__1 = 0, i__2 = ilaenv_(&c__3, "CGEQP3RK", " ", m, n, &c_n1, &c_n1, ( + ftnlen)8, (ftnlen)1); + nx = f2cmax(i__1,i__2); + + if (nx < minmn) { + +/* Determine if workspace is large enough for blocked code. */ + + if (*lwork < lwkopt) { + +/* Not enough workspace to use optimal block size that */ +/* is currently stored in NB. */ +/* Reduce NB and determine the minimum value of NB. */ + + nb = (*lwork - (*n << 1)) / (*n + 1); +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "CGEQP3RK", " ", m, n, &c_n1, + &c_n1, (ftnlen)8, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + } + } + } + +/* ================================================================== */ + +/* DONE is the boolean flag to rerpresent the case when the */ +/* factorization completed in the block factorization routine, */ +/* before the end of the block. */ + + done = FALSE_; + +/* J is the column index. */ + + j = 1; + +/* (1) Use blocked code initially. */ + +/* JMAXB is the maximum column index of the block, when the */ +/* blocked code is used, is also limited by the first stopping */ +/* criterion KMAX. */ + +/* Computing MIN */ + i__1 = *kmax, i__2 = minmn - nx; + jmaxb = f2cmin(i__1,i__2); + + if (nb >= nbmin && nb < jmax && jmaxb > 0) { + +/* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */ +/* J is the column index of a column block; */ +/* JB is the column block size to pass to block factorization */ +/* routine in a loop step; */ +/* JBF is the number of columns that were actually factorized */ +/* that was returned by the block factorization routine */ +/* in a loop step, JBF <= JB; */ +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + while(j <= jmaxb) { + +/* Computing MIN */ + i__1 = nb, i__2 = jmaxb - j + 1; + jb = f2cmin(i__1,i__2); + n_sub__ = *n - j + 1; + ioffset = j - 1; + +/* Factorize JB columns among the columns A(J:N). */ + + i__1 = *n + *nrhs - j + 1; + claqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, + &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, + maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], & + rwork[*n + j], &work[1], &work[jb + 1], &i__1, &iwork[1], + &iinfo); + +/* Set INFO on the first occurence of Inf. */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } + + if (done) { + +/* Either the submatrix is zero before the end of the */ +/* column block, or ABSTOL or RELTOL criterion is */ +/* satisfied before the end of the column block, we can */ +/* return from the routine. Perform the following before */ +/* returning: */ +/* a) Set the number of factorized columns K, */ +/* K = IOFFSET + JBF from the last call of blocked */ +/* routine. */ +/* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */ +/* by the block factorization routine; */ +/* 2) The remaining TAUs are set to ZERO by the */ +/* block factorization routine. */ + + *k = ioffset + jbf; + +/* Set INFO on the first occurrence of NaN, NaN takes */ +/* prcedence over Inf. */ + + if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + +/* Return from the routine. */ + + q__1.r = (real) lwkopt, q__1.i = 0.f; + work[1].r = q__1.r, work[1].i = q__1.i; + + return 0; + + } + + j += jbf; + + } + + } + +/* Use unblocked code to factor the last or only block. 
*/ +/* J = JMAX+1 means we factorized the maximum possible number of */ +/* columns, that is in ELSE clause we need to compute */ +/* the MAXC2NORM and RELMAXC2NORM to return after we processed */ +/* the blocks. */ + + if (j <= jmax) { + +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + n_sub__ = *n - j + 1; + ioffset = j - 1; + + i__1 = jmax - j + 1; + claqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, & + maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, + relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], &rwork[*n + j], & + work[1], &iinfo); + +/* ABSTOL or RELTOL criterion is satisfied when the number of */ +/* the factorized columns KF is smaller then the number */ +/* of columns JMAX-J+1 supplied to be factorized by the */ +/* unblocked routine, we can return from */ +/* the routine. Perform the following before returning: */ +/* a) Set the number of factorized columns K, */ +/* b) MAXC2NRMK and RELMAXC2NRMK are returned by the */ +/* unblocked factorization routine above. */ + + *k = j - 1 + kf; + +/* Set INFO on the first exception occurence. */ + +/* Set INFO on the first exception occurence of Inf or NaN, */ +/* (NaN takes precedence over Inf). */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } else if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + + } else { + +/* Compute the return values for blocked code. */ + +/* Set the number of factorized columns if the unblocked routine */ +/* was not called. */ + + *k = jmax; + +/* If there exits a residual matrix after the blocked code: */ +/* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */ +/* residual matrix, otherwise set them to ZERO; */ +/* 2) Set TAU(K+1:MINMN) to ZERO. */ + + if (*k < minmn) { + i__1 = *n - *k; + jmaxc2nrm = *k + isamax_(&i__1, &rwork[*k + 1], &c__1); + *maxc2nrmk = rwork[jmaxc2nrm]; + if (*k == 0) { + *relmaxc2nrmk = 1.f; + } else { + *relmaxc2nrmk = *maxc2nrmk / maxc2nrm; + } + + i__1 = minmn; + for (j = *k + 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + + } else { + *maxc2nrmk = 0.f; + *relmaxc2nrmk = 0.f; + + } + +/* END IF( J.LE.JMAX ) THEN */ + + } + + q__1.r = (real) lwkopt, q__1.i = 0.f; + work[1].r = q__1.r, work[1].i = q__1.i; + + return 0; + +/* End of CGEQP3RK */ + +} /* cgeqp3rk_ */ + diff --git a/lapack-netlib/SRC/cgeqp3rk.f b/lapack-netlib/SRC/cgeqp3rk.f new file mode 100644 index 000000000..731c44edb --- /dev/null +++ b/lapack-netlib/SRC/cgeqp3rk.f @@ -0,0 +1,1093 @@ +*> \brief \b CGEQP3RK computes a truncated Householder QR factorization with column pivoting of a complex m-by-n matrix A by using Level 3 BLAS and overwrites m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGEQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, +* $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ WORK, LWORK, RWORK, IWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, K, KMAX, LDA, LWORK, M, N, NRHS +* REAL ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* REAL RWORK( * ) +* COMPLEX A( LDA, * ), TAU( * ), WORK( * ) +* .. 
+* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGEQP3RK performs two tasks simultaneously: +*> +*> Task 1: The routine computes a truncated (rank K) or full rank +*> Householder QR factorization with column pivoting of a complex +*> M-by-N matrix A using Level 3 BLAS. K is the number of columns +*> that were factorized, i.e. factorization rank of the +*> factor R, K <= min(M,N). +*> +*> A * P(K) = Q(K) * R(K) = +*> +*> = Q(K) * ( R11(K) R12(K) ) = Q(K) * ( R(K)_approx ) +*> ( 0 R22(K) ) ( 0 R(K)_residual ), +*> +*> where: +*> +*> P(K) is an N-by-N permutation matrix; +*> Q(K) is an M-by-M unitary matrix; +*> R(K)_approx = ( R11(K), R12(K) ) is a rank K approximation of the +*> full rank factor R with K-by-K upper-triangular +*> R11(K) and K-by-N rectangular R12(K). The diagonal +*> entries of R11(K) appear in non-increasing order +*> of absolute value, and absolute values of all of +*> them exceed the maximum column 2-norm of R22(K) +*> up to roundoff error. +*> R(K)_residual = R22(K) is the residual of a rank K approximation +*> of the full rank factor R. It is a +*> an (M-K)-by-(N-K) rectangular matrix; +*> 0 is a an (M-K)-by-K zero matrix. +*> +*> Task 2: At the same time, the routine overwrites a complex M-by-NRHS +*> matrix B with Q(K)**H * B using Level 3 BLAS. +*> +*> ===================================================================== +*> +*> The matrices A and B are stored on input in the array A as +*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS) +*> respectively. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> The truncation criteria (i.e. when to stop the factorization) +*> can be any of the following: +*> +*> 1) The input parameter KMAX, the maximum number of columns +*> KMAX to factorize, i.e. the factorization rank is limited +*> to KMAX. If KMAX >= min(M,N), the criterion is not used. +*> +*> 2) The input parameter ABSTOL, the absolute tolerance for +*> the maximum column 2-norm of the residual matrix R22(K). This +*> means that the factorization stops if this norm is less or +*> equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used. +*> +*> 3) The input parameter RELTOL, the tolerance for the maximum +*> column 2-norm matrix of the residual matrix R22(K) divided +*> by the maximum column 2-norm of the original matrix A, which +*> is equal to abs(R(1,1)). This means that the factorization stops +*> when the ratio of the maximum column 2-norm of R22(K) to +*> the maximum column 2-norm of A is less than or equal to RELTOL. +*> If RELTOL < 0.0, the criterion is not used. +*> +*> 4) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix R22(K) is a zero matrix in some +*> factorization step K. ( This stopping criterion is implicit. ) +*> +*> The algorithm stops when any of these conditions is first +*> satisfied, otherwise the whole matrix A is factorized. +*> +*> To factorize the whole matrix A, use the values +*> KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0. +*> +*> The routine returns: +*> a) Q(K), R(K)_approx = ( R11(K), R12(K) ), +*> R(K)_residual = R22(K), P(K), i.e. the resulting matrices +*> of the factorization; P(K) is represented by JPIV, +*> ( if K = min(M,N), R(K)_approx is the full factor R, +*> and there is no residual matrix R(K)_residual); +*> b) K, the number of columns that were factorized, +*> i.e. 
factorization rank; +*> c) MAXC2NRMK, the maximum column 2-norm of the residual +*> matrix R(K)_residual = R22(K), +*> ( if K = min(M,N), MAXC2NRMK = 0.0 ); +*> d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum +*> column 2-norm of the original matrix A, which is equal +*> to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 ); +*> e) Q(K)**H * B, the matrix B with the unitary +*> transformation Q(K)**H applied on the left. +*> +*> The N-by-N permutation matrix P(K) is stored in a compact form in +*> the integer array JPIV. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The M-by-M unitary matrix Q is represented as a product +*> of elementary Householder reflectors +*> +*> Q(K) = H(1) * H(2) * . . . * H(K), +*> +*> where K is the number of columns that were factorized. +*> +*> Each H(j) has the form +*> +*> H(j) = I - tau * v * v**H, +*> +*> where 1 <= j <= K and +*> I is an M-by-M identity matrix, +*> tau is a complex scalar, +*> v is a complex vector with v(1:j-1) = 0 and v(j) = 1. +*> +*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j). +*> +*> See the Further Details section for more information. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e. the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M,N), then this stopping criterion +*> is not used, the routine factorizes columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B are not modified, and +*> the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL +*> +*> The second factorization stopping criterion, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix R22(K). +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix R22(K) +*> is less than or equal to ABSTOL. Let SAFMIN = DLAMCH('S'). +*> +*> a) If ABSTOL is NaN, then no computation is performed +*> and an error message ( INFO = -5 ) is issued +*> by XERBLA. +*> +*> b) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN +*> is used. This includes the case ABSTOL = -0.0. +*> +*> d) If 2*SAFMIN <= ABSTOL then the input value +*> of ABSTOL is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If ABSTOL chosen above is >= MAXC2NRM, then this +*> stopping criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. The routine +*> returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. 
+*> This includes the case ABSTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is REAL +*> +*> The third factorization stopping criterion, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio +*> abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of +*> the residual matrix R22(K) to the maximum column 2-norm of +*> the original matrix A. The algorithm converges (stops the +*> factorization), when abs(R(K+1,K+1))/abs(R(1,1)) A is less +*> than or equal to RELTOL. Let EPS = DLAMCH('E'). +*> +*> a) If RELTOL is NaN, then no computation is performed +*> and an error message ( INFO = -6 ) is issued +*> by XERBLA. +*> +*> b) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used. +*> This includes the case RELTOL = -0.0. +*> +*> d) If EPS <= RELTOL then the input value of RELTOL +*> is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If RELTOL chosen above is >= 1.0, then this stopping +*> criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. +*> The routine returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case RELTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> +*> NOTE: We recommend that RELTOL satisfy +*> min( 10*max(M,N)*EPS, sqrt(EPS) ) <= RELTOL +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N+NRHS) +*> +*> On entry: +*> +*> a) The subarray A(1:M,1:N) contains the M-by-N matrix A. +*> b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS +*> matrix B. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> +*> a) The subarray A(1:M,1:N) contains parts of the factors +*> of the matrix A: +*> +*> 1) If K = 0, A(1:M,1:N) contains the original matrix A. +*> 2) If K > 0, A(1:M,1:N) contains parts of the +*> factors: +*> +*> 1. The elements below the diagonal of the subarray +*> A(1:M,1:K) together with TAU(1:K) represent the +*> unitary matrix Q(K) as a product of K Householder +*> elementary reflectors. +*> +*> 2. The elements on and above the diagonal of +*> the subarray A(1:K,1:N) contain K-by-N +*> upper-trapezoidal matrix +*> R(K)_approx = ( R11(K), R12(K) ). +*> NOTE: If K=min(M,N), i.e. full rank factorization, +*> then R_approx(K) is the full factor R which +*> is upper-trapezoidal. If, in addition, M>=N, +*> then R is upper-triangular. +*> +*> 3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K) +*> rectangular matrix R(K)_residual = R22(K). +*> +*> b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains +*> the M-by-NRHS product Q(K)**H * B. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> This is the leading dimension for both matrices, A and B. +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M,KMAX,N). 
+*> +*> K also represents the number of non-zero Householder +*> vectors. +*> +*> NOTE: If K = 0, a) the arrays A and B are not modified; +*> b) the array TAU(1:min(M,N)) is set to ZERO, +*> if the matrix A does not contain NaN, +*> otherwise the elements TAU(1:min(M,N)) +*> are undefined; +*> c) the elements of the array JPIV are set +*> as follows: for j = 1:N, JPIV(j) = j. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is REAL +*> The maximum column 2-norm of the residual matrix R22(K), +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then MAXC2NRMK equals the maximum column 2-norm +*> of the original matrix A. +*> +*> b) If 0 < K < min(M,N), then MAXC2NRMK is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then MAXC2NRMK = 0.0. +*> +*> NOTE: MAXC2NRMK in the factorization step K would equal +*> R(K+1,K+1) in the next factorization step K+1. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is REAL +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix R22(K) (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then RELMAXC2NRMK = 1.0. +*> +*> b) If 0 < K < min(M,N), then +*> RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then RELMAXC2NRMK = 0.0. +*> +*> NOTE: RELMAXC2NRMK in the factorization step K would equal +*> abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization +*> step K+1. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The elements of the array JPIV(1:N) are always set +*> by the routine, for example, even when no columns +*> were factorized, i.e. when K = 0, the elements are +*> set as JPIV(j) = j for j = 1:N. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX array, dimension (min(M,N)) +*> The scalar factors of the elementary reflectors. +*> +*> If 0 < K <= min(M,N), only the elements TAU(1:K) of +*> the array TAU are modified by the factorization. +*> After the factorization computed, if no NaN was found +*> during the factorization, the remaining elements +*> TAU(K+1:min(M,N)) are set to zero, otherwise the +*> elements TAU(K+1:min(M,N)) are not set and therefore +*> undefined. +*> ( If K = 0, all elements of TAU are set to zero, if +*> the matrix A does not contain NaN. ) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= N+NRHS-1, otherwise. +*> For optimal performance LWORK >= NB*( N+NRHS+1 ), +*> where NB is the optimal block size for CGEQP3RK returned +*> by ILAENV. Minimal block size MINNB=2. 
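Editorial note: the LWORK rules above follow the standard LAPACK two-call convention, so a minimal fixed-form driver sketch may help; the program name TRUNCQR, the 6-by-4 shape, the rank-1 all-ones data, and the RELTOL value are illustrative assumptions, not part of the patched sources.

      PROGRAM TRUNCQR
      IMPLICIT NONE
      INTEGER            M, N, NRHS, LDA, KMAX, K, LWORK, INFO
      PARAMETER          ( M = 6, N = 4, NRHS = 1, LDA = M )
      INTEGER            JPIV( N ), IWORK( N-1 )
      REAL               ABSTOL, RELTOL, MAXC2NRMK, RELMAXC2NRMK
      REAL               RWORK( 2*N )
      COMPLEX            A( LDA, N+NRHS ), TAU( N ), WQ( 1 )
      COMPLEX, ALLOCATABLE :: WORK( : )
*     Rank-1 test data: the matrix A and the single right-hand side
*     stored in column N+1 are all ones.
      A = ( 1.0E0, 0.0E0 )
      KMAX = N
      ABSTOL = -1.0E0
      RELTOL = 1.0E-4
*     Workspace query (LWORK = -1); the size comes back in WQ(1).
      LWORK = -1
      CALL CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, K,
     $               MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, WQ, LWORK,
     $               RWORK, IWORK, INFO )
      LWORK = INT( WQ( 1 ) )
      ALLOCATE( WORK( LWORK ) )
*     Truncated factorization; for this rank-1 A the RELTOL test
*     should stop it after K = 1 column.
      CALL CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, K,
     $               MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, WORK, LWORK,
     $               RWORK, IWORK, INFO )
      END

ABSTOL and RELTOL are passed as variables rather than literals on purpose: as described above, the routine clamps them to 2*SAFMIN and EPS respectively.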
+*>
+*>          NOTE: The decision, whether to use unblocked BLAS 2
+*>          or blocked BLAS 3 code is based not only on the dimension
+*>          LWORK of the available workspace WORK, but also on the
+*>          matrix A dimension N via crossover point NX returned
+*>          by ILAENV. (For N less than NX, unblocked code should be
+*>          used.)
+*>
+*>          If LWORK = -1, then a workspace query is assumed;
+*>          the routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (2*N)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          Is a work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix in the blocked step auxiliary subroutine CLAQP3RK ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) INFO < 0: if INFO = -i, the i-th argument had an
+*>                       illegal value.
+*>          3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step K+1 ( when K columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             K                  is set to the number of
+*>                                factorized columns without
+*>                                exception.
+*>             MAXC2NRMK          is set to NaN.
+*>             RELMAXC2NRMK       is set to NaN.
+*>             TAU(K+1:min(M,N))  is not set and contains undefined
+*>                                elements. If j_1=K+1, TAU(K+1)
+*>                                may contain NaN.
+*>          4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup geqp3rk
+*
+*> \par Further Details:
+* =====================
+*
+*> \verbatim
+*> CGEQP3RK is based on the same BLAS3 Householder QR factorization
+*> algorithm with column pivoting as in CGEQP3 routine which uses
+*> CLARFG routine to generate Householder reflectors
+*> for QR factorization.
+*>
+*> We can also write:
+*>
+*>   A = A_approx(K) + A_residual(K)
+*>
+*> The low rank approximation matrix A(K)_approx from
+*> the truncated QR factorization of rank K of the matrix A is:
+*>
+*>   A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T
+*>                        (      0     0 )
+*>
+*>               = Q(K) * ( R11(K) R12(K) ) * P(K)**T
+*>                        (      0      0 )
+*>
+*> The residual A_residual(K) of the matrix A is:
+*>
+*>   A_residual(K) = Q(K) * ( 0  0             ) * P(K)**T =
+*>                          ( 0  R(K)_residual )
+*>
+*>                 = Q(K) * ( 0  0      ) * P(K)**T
+*>                          ( 0  R22(K) )
+*>
+*> The truncated (rank K) factorization guarantees that
+*> the maximum column 2-norm of A_residual(K) is less than
+*> or equal to MAXC2NRMK up to roundoff error.
+*>
+*> NOTE: An approximation of the null vectors
+*>       of A can be easily computed from R11(K)
+*>       and R12(K):
+*>
+*>       Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) )
+*>                                 (          -I          )
+*>
+*> \endverbatim
+*
+*> \par References:
+* ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G.
Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, + $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ WORK, LWORK, RWORK, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS + REAL ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + REAL RWORK( * ) + COMPLEX A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER INB, INBMIN, IXOVER + PARAMETER ( INB = 1, INBMIN = 2, IXOVER = 3 ) + REAL ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 ) + COMPLEX CZERO + PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, DONE + INTEGER IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX, + $ JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB, + $ NBMIN, NX + REAL EPS, HUGEVAL, MAXC2NRM, SAFMIN +* .. +* .. External Subroutines .. + EXTERNAL CLAQP2RK, CLAQP3RK, XERBLA +* .. +* .. External Functions .. + LOGICAL SISNAN + INTEGER ISAMAX, ILAENV + REAL SLAMCH, SCNRM2, SROUNDUP_LWORK + EXTERNAL SISNAN, SLAMCH, SCNRM2, ISAMAX, ILAENV, + $ SROUNDUP_LWORK +* .. +* .. Intrinsic Functions .. + INTRINSIC CMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test input arguments +* ==================== +* + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( KMAX.LT.0 ) THEN + INFO = -4 + ELSE IF( SISNAN( ABSTOL ) ) THEN + INFO = -5 + ELSE IF( SISNAN( RELTOL ) ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -8 + END IF +* +* If the input parameters M, N, NRHS, KMAX, LDA are valid: +* a) Test the input workspace size LWORK for the minimum +* size requirement IWS. +* b) Determine the optimal block size NB and optimal +* workspace size LWKOPT to be returned in WORK(1) +* in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., +* (3) when routine exits. 
+*     Here, IWS is the minimum workspace required for unblocked
+*     code.
+*
+      IF( INFO.EQ.0 ) THEN
+         MINMN = MIN( M, N )
+         IF( MINMN.EQ.0 ) THEN
+            IWS = 1
+            LWKOPT = 1
+         ELSE
+*
+*           Minimal workspace size in case of using only unblocked
+*           BLAS 2 code in CLAQP2RK.
+*           1) CLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in CLARF subroutine inside CLAQP2RK to apply an
+*              elementary reflector from the left.
+*           TOTAL_WORK_SIZE = 3*N + NRHS - 1
+*
+            IWS = N + NRHS - 1
+*
+*           Assign to NB optimal block size.
+*
+            NB = ILAENV( INB, 'CGEQP3RK', ' ', M, N, -1, -1 )
+*
+*           A formula for the optimal workspace size in case of using
+*           both unblocked BLAS 2 in CLAQP2RK and blocked BLAS 3 code
+*           in CLAQP3RK.
+*           1) CGEQP3RK, CLAQP2RK, CLAQP3RK: 2*N to store full and
+*              partial column 2-norms.
+*           2) CLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in CLARF subroutine to apply an elementary reflector
+*              from the left.
+*           3) CLAQP3RK: NB*(N+NRHS) to use in the work array F that
+*              is used to apply a block reflector from
+*              the left.
+*           4) CLAQP3RK: NB to use in the auxiliary array AUX.
+*           Sizes (2) and ((3) + (4)) should intersect, therefore
+*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2.
+*
+            LWKOPT = 2*N + NB*( N+NRHS+1 )
+         END IF
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
+         IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+*     NOTE: The optimal workspace size is returned in WORK(1), if
+*     the input parameters M, N, NRHS, KMAX, LDA are valid.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CGEQP3RK', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible for M=0 or N=0.
+*
+      IF( MINMN.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+*     Initialize column pivot array JPIV.
+*
+      DO J = 1, N
+         JPIV( J ) = J
+      END DO
+*
+*     ==================================================================
+*
+*     Initialize storage for partial and exact column 2-norms.
+*     a) The elements RWORK(1:N) are used to store partial column
+*        2-norms of the matrix A, and may decrease in each computation
+*        step; initialize to the values of complete columns 2-norms.
+*     b) The elements RWORK(N+1:2*N) are used to store complete column
+*        2-norms of the matrix A, they are not changed during the
+*        computation; initialize the values of complete columns 2-norms.
+*
+      DO J = 1, N
+         RWORK( J ) = SCNRM2( M, A( 1, J ), 1 )
+         RWORK( N+J ) = RWORK( J )
+      END DO
+*
+*     ==================================================================
+*
+*     Compute the pivot column index and the maximum column 2-norm
+*     for the whole original matrix stored in A(1:M,1:N).
+*
+      KP1 = ISAMAX( N, RWORK( 1 ), 1 )
+      MAXC2NRM = RWORK( KP1 )
+*
+*     ==================================================================
+*
+      IF( SISNAN( MAXC2NRM ) ) THEN
+*
+*        Check if the matrix A contains NaN, set INFO parameter
+*        to the column number where the first NaN is found and return
+*        from the routine.
+*
+         K = 0
+         INFO = KP1
+*
+*        Set MAXC2NRMK and RELMAXC2NRMK to NaN.
+*
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = MAXC2NRM
+*
+*        Array TAU is not set and contains undefined elements.
+*
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+*     ===================================================================
+*
+      IF( MAXC2NRM.EQ.ZERO ) THEN
+*
+*        Check if the matrix A is a zero matrix, set array TAU and
+*        return from the routine.
+*
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+*
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+*
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+*
+      END IF
+*
+*     ===================================================================
+*
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+      IF( MAXC2NRM.GT.HUGEVAL ) THEN
+*
+*        Check if the matrix A contains +Inf or -Inf, set INFO parameter
+*        to the column number, where the first +/-Inf is found plus N,
+*        and continue the computation.
+*
+         INFO = N + KP1
+*
+      END IF
+*
+*     ==================================================================
+*
+*     Quick return if possible for the case when the first
+*     stopping criterion is satisfied, i.e. KMAX = 0.
+*
+      IF( KMAX.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+      EPS = SLAMCH('Epsilon')
+*
+*     Adjust ABSTOL
+*
+      IF( ABSTOL.GE.ZERO ) THEN
+         SAFMIN = SLAMCH('Safe minimum')
+         ABSTOL = MAX( ABSTOL, TWO*SAFMIN )
+      END IF
+*
+*     Adjust RELTOL
+*
+      IF( RELTOL.GE.ZERO ) THEN
+         RELTOL = MAX( RELTOL, EPS )
+      END IF
+*
+*     ===================================================================
+*
+*     JMAX is the maximum index of the column to be factorized,
+*     which is also limited by the first stopping criterion KMAX.
+*
+      JMAX = MIN( KMAX, MINMN )
+*
+*     ===================================================================
+*
+*     Quick return if possible for the case when the second or third
+*     stopping criterion for the whole original matrix is satisfied,
+*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL
+*     (which is ONE <= RELTOL).
+*
+      IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN
+*
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+*
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+*
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*     Factorize columns
+*     ==================================================================
+*
+*     Determine the block size.
+*
+      NBMIN = 2
+      NX = 0
+*
+      IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN
+*
+*        Determine when to cross over from blocked to unblocked code.
+*        (for N less than NX, unblocked code should be used).
+*
+         NX = MAX( 0, ILAENV( IXOVER, 'CGEQP3RK', ' ', M, N, -1, -1 ) )
+*
+         IF( NX.LT.MINMN ) THEN
+*
+*           Determine if workspace is large enough for blocked code.
+*
+            IF( LWORK.LT.LWKOPT ) THEN
+*
+*              Not enough workspace to use optimal block size that
+*              is currently stored in NB.
+*              Reduce NB and determine the minimum value of NB.
+*
+               NB = ( LWORK-2*N ) / ( N+1 )
+               NBMIN = MAX( 2, ILAENV( INBMIN, 'CGEQP3RK', ' ', M, N,
+     $                 -1, -1 ) )
+*
+            END IF
+         END IF
+      END IF
+*
+*     ==================================================================
+*
+*     DONE is the boolean flag to represent the case when the
+*     factorization completed in the block factorization routine,
+*     before the end of the block.
+*
+      DONE = .FALSE.
+*
+*     J is the column index.
+*
+      J = 1
+*
+*     (1) Use blocked code initially.
+*
+*     JMAXB is the maximum column index of the block when the
+*     blocked code is used; it is also limited by the first stopping
+*     criterion KMAX.
+*
+      JMAXB = MIN( KMAX, MINMN - NX )
+*
+      IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN
+*
+*        Loop over the column blocks of the matrix A(1:M,1:JMAXB).
+*        Here:
+*        J is the column index of a column block;
+*        JB is the column block size to pass to block factorization
+*        routine in a loop step;
+*        JBF is the number of columns that were actually factorized,
+*        as returned by the block factorization routine
+*        in a loop step, JBF <= JB;
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         DO WHILE( J.LE.JMAXB )
+*
+            JB = MIN( NB, JMAXB-J+1 )
+            N_SUB = N-J+1
+            IOFFSET = J-1
+*
+*           Factorize JB columns among the columns A(J:N).
+*
+            CALL CLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                     DONE, JBF, MAXC2NRMK, RELMAXC2NRMK,
+     $                     JPIV( J ), TAU( J ),
+     $                     RWORK( J ), RWORK( N+J ),
+     $                     WORK( 1 ), WORK( JB+1 ),
+     $                     N+NRHS-J+1, IWORK, IINFO )
+*
+*           Set INFO on the first occurrence of Inf.
+*
+            IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+               INFO = 2*IOFFSET + IINFO
+            END IF
+*
+            IF( DONE ) THEN
+*
+*              Either the submatrix is zero before the end of the
+*              column block, or the ABSTOL or RELTOL criterion is
+*              satisfied before the end of the column block; in either
+*              case we can return from the routine. Perform the
+*              following before returning:
+*                 a) Set the number of factorized columns K,
+*                    K = IOFFSET + JBF from the last call of blocked
+*                    routine.
+*                 NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned
+*                          by the block factorization routine;
+*                       2) The remaining TAUs are set to ZERO by the
+*                          block factorization routine.
+*
+               K = IOFFSET + JBF
+*
+*              Set INFO on the first occurrence of NaN, NaN takes
+*              precedence over Inf.
+*
+               IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+                  INFO = IOFFSET + IINFO
+               END IF
+*
+*              Return from the routine.
+*
+               WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
+               RETURN
+*
+            END IF
+*
+            J = J + JBF
+*
+         END DO
+*
+      END IF
+*
+*     Use unblocked code to factor the last or only block.
+*     J = JMAX+1 means we factorized the maximum possible number of
+*     columns; that is, in the ELSE clause we need to compute
+*     the MAXC2NRMK and RELMAXC2NRMK to return after we processed
+*     the blocks.
+*
+      IF( J.LE.JMAX ) THEN
+*
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         N_SUB = N-J+1
+         IOFFSET = J-1
+*
+         CALL CLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1,
+     $                  ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                  KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ),
+     $                  TAU( J ), RWORK( J ), RWORK( N+J ),
+     $                  WORK( 1 ), IINFO )
+*
+*        The ABSTOL or RELTOL criterion is satisfied when the number of
+*        the factorized columns KF is smaller than the number
+*        of columns JMAX-J+1 supplied to be factorized by the
+*        unblocked routine; in that case we can return from
+*        the routine. Perform the following before returning:
+*           a) Set the number of factorized columns K,
+*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the
+*              unblocked factorization routine above.
+*
+         K = J - 1 + KF
+*
+*        Set INFO on the first exception occurrence of Inf or NaN
+*        (NaN takes precedence over Inf).
+*
+         IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+            INFO = 2*IOFFSET + IINFO
+         ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+            INFO = IOFFSET + IINFO
+         END IF
+*
+      ELSE
+*
+*        Compute the return values for blocked code.
+*
+*        Set the number of factorized columns if the unblocked routine
+*        was not called.
+*
+         K = JMAX
+*
+*        If there exists a residual matrix after the blocked code:
+*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the
+*              residual matrix, otherwise set them to ZERO;
+*           2) Set TAU(K+1:MINMN) to ZERO.
+* + IF( K.LT.MINMN ) THEN + JMAXC2NRM = K + ISAMAX( N-K, RWORK( K+1 ), 1 ) + MAXC2NRMK = RWORK( JMAXC2NRM ) + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + DO J = K + 1, MINMN + TAU( J ) = CZERO + END DO +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + END IF +* +* END IF( J.LE.JMAX ) THEN +* + END IF +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* + RETURN +* +* End of CGEQP3RK +* + END diff --git a/lapack-netlib/SRC/cgeqr.f b/lapack-netlib/SRC/cgeqr.f index d10e3da65..3617594d0 100644 --- a/lapack-netlib/SRC/cgeqr.f +++ b/lapack-netlib/SRC/cgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE CGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,11 +190,12 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLATSQR, CGEQRT, XERBLA @@ -244,8 +247,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +258,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +273,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +287,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +314,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) * RETURN * diff --git a/lapack-netlib/SRC/cgeqrfp.f b/lapack-netlib/SRC/cgeqrfp.f index eaf98ddf3..5b6226c67 100644 --- a/lapack-netlib/SRC/cgeqrfp.f +++ b/lapack-netlib/SRC/cgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. 
 EXTERNAL           CGEQR2P, CLARFB, CLARFT, XERBLA
@@ -182,8 +183,16 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'CGEQRF', ' ', M, N, -1, -1 )
-      LWKOPT = N*NB
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      K = MIN( M, N )
+      IF( K.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = N
+         LWKOPT = N*NB
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -191,7 +200,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
       IF( INFO.NE.0 ) THEN
@@ -203,7 +212,6 @@
 *
 *     Quick return if possible
 *
-      K = MIN( M, N )
       IF( K.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -211,7 +219,7 @@
 *
       NBMIN = 2
       NX = 0
-      IWS = N
+      IWS = LWKMIN
       IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
 *        Determine when to cross over from blocked to unblocked code.
@@ -273,7 +281,7 @@
      $   CALL CGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK,
      $                 IINFO )
 *
-      WORK( 1 ) = SROUNDUP_LWORK(IWS)
+      WORK( 1 ) = SROUNDUP_LWORK( IWS )
       RETURN
 *
 *     End of CGEQRFP
diff --git a/lapack-netlib/SRC/cgesvdx.f b/lapack-netlib/SRC/cgesvdx.f
index fbdb121ca..e1856a65f 100644
--- a/lapack-netlib/SRC/cgesvdx.f
+++ b/lapack-netlib/SRC/cgesvdx.f
@@ -208,7 +208,7 @@
 *> \param[out] WORK
 *> \verbatim
 *> WORK is COMPLEX array, dimension (MAX(1,LWORK))
-*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK;
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
@@ -261,7 +261,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEsing
+*> \ingroup gesvdx
 *
 * =====================================================================
       SUBROUTINE CGESVDX( JOBU, JOBVT, RANGE, M, N, A, LDA, VL, VU,
@@ -312,8 +312,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, CLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANGE
+      REAL               SLAMCH, CLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN, SQRT
@@ -448,7 +448,7 @@
          END IF
       END IF
       MAXWRK = MAX( MAXWRK, MINWRK )
-      WORK( 1 ) = CMPLX( REAL( MAXWRK ), ZERO )
+      WORK( 1 ) = SROUNDUP_LWORK( MAXWRK )
 *
       IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
          INFO = -19
@@ -464,7 +464,7 @@
 *
 *     Quick return if possible
 *
-      IF( M.EQ.0 .OR. N.EQ.0 ) THEN
+      IF( MINMN.EQ.0 ) THEN
          RETURN
       END IF
 *
@@ -846,7 +846,7 @@
 *
 *     Return optimal workspace in WORK(1)
 *
-      WORK( 1 ) = CMPLX( REAL( MAXWRK ), ZERO )
+      WORK( 1 ) = SROUNDUP_LWORK( MAXWRK )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgesvj.f b/lapack-netlib/SRC/cgesvj.f
index 149cf5e48..b9c8f1709 100644
--- a/lapack-netlib/SRC/cgesvj.f
+++ b/lapack-netlib/SRC/cgesvj.f
@@ -208,15 +208,17 @@
 *> \verbatim
 *> CWORK is COMPLEX array, dimension (max(1,LWORK))
 *> Used as workspace.
-*> If on entry LWORK = -1, then a workspace query is assumed and
-*> no computation is done; CWORK(1) is set to the minial (and optimal)
-*> length of CWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *> LWORK is INTEGER.
-*> Length of CWORK, LWORK >= M+N.
+*> Length of CWORK.
+*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M+N, otherwise.
+*>
+*> If on entry LWORK = -1, then a workspace query is assumed and
+*> no computation is done; CWORK(1) is set to the minimal (and optimal)
+*> length of CWORK.
 *> \endverbatim
 *>
 *> \param[in,out] RWORK
@@ -247,15 +249,17 @@
 *> RWORK(6) = the largest absolute value over all sines of the
 *> Jacobi rotation angles in the last sweep. It can be
 *> useful for a post festum analysis.
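The relocated text above documents the standard two-phase workspace query, a pattern that recurs throughout this patch: a first call with a negative length only reports sizes, a second call does the work. A hedged caller-side sketch for CGESVJ, with every caller variable hypothetical:

*     Illustrative sketch only (not part of the patch): query the
*     minimal CWORK/RWORK sizes with LWORK = -1 / LRWORK = -1, then
*     call again with real workspace.  All variables hypothetical.
      CALL CGESVJ( 'G', 'U', 'V', M, N, A, LDA, SVA, MV, V, LDV,
     $             CWORK, -1, RWORK, -1, INFO )
      LWORK = INT( CWORK( 1 ) )
      LRWORK = INT( RWORK( 1 ) )
*     ... allocate CWORK( LWORK ) and RWORK( LRWORK ), then:
      CALL CGESVJ( 'G', 'U', 'V', M, N, A, LDA, SVA, MV, V, LDV,
     $             CWORK, LWORK, RWORK, LRWORK, INFO )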
-*> If on entry LRWORK = -1, then a workspace query is assumed and
-*> no computation is done; RWORK(1) is set to the minial (and optimal)
-*> length of RWORK.
 *> \endverbatim
 *>
 *> \param[in] LRWORK
 *> \verbatim
 *> LRWORK is INTEGER
-*> Length of RWORK, LRWORK >= MAX(6,N).
+*> Length of RWORK.
+*> LRWORK >= 1, if MIN(M,N) = 0, and LRWORK >= MAX(6,N), otherwise.
+*>
+*> If on entry LRWORK = -1, then a workspace query is assumed and
+*> no computation is done; RWORK(1) is set to the minimal (and optimal)
+*> length of RWORK.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -276,7 +280,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup gesvj
 *
 *> \par Further Details:
 *  =====================
@@ -374,16 +378,17 @@
       PARAMETER  ( NSWEEP = 30 )
 *     ..
 *     .. Local Scalars ..
-      COMPLEX AAPQ, OMPQ
-      REAL    AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG,
-     $        BIGTHETA, CS, CTOL, EPSLN, MXAAPQ,
-     $        MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL,
-     $        SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL
-      INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1,
-     $        ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34,
-     $        N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND
-      LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK,
-     $        RSVEC, UCTOL, UPPER
+      COMPLEX AAPQ, OMPQ
+      REAL    AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG,
+     $        BIGTHETA, CS, CTOL, EPSLN, MXAAPQ,
+     $        MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL,
+     $        SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL
+      INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1,
+     $        ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34,
+     $        N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND,
+     $        MINMN, LWMIN, LRWMIN
+      LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK,
+     $        RSVEC, UCTOL, UPPER
 *     ..
 *     ..
 *     .. Intrinsic Functions ..
@@ -398,8 +403,8 @@
       INTEGER ISAMAX
       EXTERNAL ISAMAX
 *     from LAPACK
-      REAL    SLAMCH
-      EXTERNAL SLAMCH
+      REAL    SLAMCH, SROUNDUP_LWORK
+      EXTERNAL SLAMCH, SROUNDUP_LWORK
       LOGICAL LSAME
       EXTERNAL LSAME
 *     ..
@@ -422,7 +427,16 @@
       UPPER = LSAME( JOBA, 'U' )
       LOWER = LSAME( JOBA, 'L' )
 *
-      LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 )
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWMIN = 1
+         LRWMIN = 1
+      ELSE
+         LWMIN = M + N
+         LRWMIN = MAX( 6, N )
+      END IF
+*
+      LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 )
       IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN
          INFO = -1
       ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN
         INFO = -2
@@ -442,9 +456,9 @@
          INFO = -11
       ELSE IF( UCTOL .AND. ( RWORK( 1 ).LE.ONE ) ) THEN
          INFO = -12
-      ELSE IF( LWORK.LT.( M+N ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN
          INFO = -13
-      ELSE IF( LRWORK.LT.MAX( N, 6 ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LRWORK.LT.LRWMIN .AND. ( .NOT.LQUERY ) ) THEN
          INFO = -15
       ELSE
          INFO = 0
@@ -454,15 +468,15 @@
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'CGESVJ', -INFO )
          RETURN
-      ELSE IF ( LQUERY ) THEN
-         CWORK(1) = M + N
-         RWORK(1) = MAX( N, 6 )
+      ELSE IF( LQUERY ) THEN
+         CWORK( 1 ) = SROUNDUP_LWORK( LWMIN )
+         RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN )
          RETURN
       END IF
 *
 * #:) Quick return for void matrix
 *
-      IF( ( M.EQ.0 ) .OR.
( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/cgetri.f b/lapack-netlib/SRC/cgetri.f index 2060d1444..2eb3da7ab 100644 --- a/lapack-netlib/SRC/cgetri.f +++ b/lapack-netlib/SRC/cgetri.f @@ -153,8 +153,8 @@ * INFO = 0 NB = ILAENV( 1, 'CGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -252,7 +252,7 @@ $ CALL CSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) 60 CONTINUE * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGETRI diff --git a/lapack-netlib/SRC/cgetsls.f b/lapack-netlib/SRC/cgetsls.f index b4bb7562f..3f43dc8de 100644 --- a/lapack-netlib/SRC/cgetsls.f +++ b/lapack-netlib/SRC/cgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -229,7 +229,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEO = 1 + WSIZEM = 1 + ELSE IF ( M.GE.N ) THEN CALL CGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) diff --git a/lapack-netlib/SRC/cgetsqrhrt.f b/lapack-netlib/SRC/cgetsqrhrt.f index 4e4dc1d4a..087e9bc7f 100644 --- a/lapack-netlib/SRC/cgetsqrhrt.f +++ b/lapack-netlib/SRC/cgetsqrhrt.f @@ -131,13 +131,15 @@ *> \param[in] LWORK *> \verbatim *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -160,7 +162,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup comlpexOTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -200,6 +202,10 @@ INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS * .. +* .. External Functions .. + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL CCOPY, CLATSQR, CUNGTSQR_ROW, CUNHR_COL, $ XERBLA @@ -212,7 +218,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -225,7 +231,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -263,8 +269,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. 
.NOT.LQUERY ) THEN INFO = -11 END IF * @@ -277,14 +284,14 @@ CALL XERBLA( 'CGETSQRHRT', -INFO ) RETURN ELSE IF ( LQUERY ) THEN - WORK( 1 ) = CMPLX( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * * Quick return if possible * IF( MIN( M, N ).EQ.0 ) THEN - WORK( 1 ) = CMPLX( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * @@ -341,9 +348,9 @@ END IF END DO * - WORK( 1 ) = CMPLX( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN * * End of CGETSQRHRT * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/cgges3.f b/lapack-netlib/SRC/cgges3.f index aac9f9510..c1ca79688 100644 --- a/lapack-netlib/SRC/cgges3.f +++ b/lapack-netlib/SRC/cgges3.f @@ -215,7 +215,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -260,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE CGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -300,7 +301,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT + $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT, + $ LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SMLNUM * .. @@ -310,13 +312,12 @@ * .. * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHD3, CLAQZ0, CLACPY, - $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, SLABAD, - $ XERBLA + $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, XERBLA * .. * .. External Functions .. LOGICAL LSAME - REAL CLANGE, SLAMCH - EXTERNAL LSAME, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -353,6 +354,8 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -369,7 +372,7 @@ INFO = -14 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -16 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -18 END IF * @@ -377,29 +380,33 @@ * IF( INFO.EQ.0 ) THEN CALL CGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N + INT( WORK( 1 ) ) ) CALL CUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) IF( ILVSL ) THEN CALL CUNGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, $ IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) END IF CALL CGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL, $ LDVSL, VSR, LDVSR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) CALL CLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, WORK, -1, $ RWORK, 0, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) IF( WANTST ) THEN CALL CTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, SDIM, $ PVSL, PVSR, DIF, WORK, -1, IDUM, 1, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF - WORK( 1 ) = CMPLX( LWKOPT ) END IF * @@ -422,7 +429,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -585,7 +591,7 @@ * 30 CONTINUE * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cggev3.f b/lapack-netlib/SRC/cggev3.f index 9483ecdeb..d2b75aebc 100644 --- a/lapack-netlib/SRC/cggev3.f +++ b/lapack-netlib/SRC/cggev3.f @@ -174,7 +174,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -208,7 +209,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE CGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA, @@ -243,7 +244,7 @@ CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, $ IN, IRIGHT, IROWS, IRWRK, ITAU, IWRK, JC, JR, - $ LWKOPT + $ LWKOPT, LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP COMPLEX X @@ -253,13 +254,12 @@ * .. * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHD3, CLAQZ0, CLACPY, - $ CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, SLABAD, - $ XERBLA + $ CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, XERBLA * .. * .. External Functions .. LOGICAL LSAME - REAL CLANGE, SLAMCH - EXTERNAL LSAME, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, REAL, SQRT @@ -301,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -315,7 +316,7 @@ INFO = -11 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -15 END IF * @@ -323,7 +324,7 @@ * IF( INFO.EQ.0 ) THEN CALL CGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( N, N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N+INT( WORK( 1 ) ) ) CALL CUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) @@ -348,7 +349,11 @@ $ RWORK, 0, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) END IF - WORK( 1 ) = CMPLX( LWKOPT ) + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + END IF END IF * IF( INFO.NE.0 ) THEN @@ -368,7 +373,6 @@ EPS = SLAMCH( 'E' )*SLAMCH( 'B' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -549,7 +553,7 @@ IF( ILBSCL ) $ CALL CLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR ) * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CGGEV3 diff --git a/lapack-netlib/SRC/cgghd3.f b/lapack-netlib/SRC/cgghd3.f index 1074b4828..f7175a72c 100644 --- a/lapack-netlib/SRC/cgghd3.f +++ b/lapack-netlib/SRC/cgghd3.f @@ -180,14 +180,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -212,7 +212,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -265,7 +265,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGGHRD, CLARTG, CLASET, CUNM22, CROT, CGEMM, @@ -280,8 +281,13 @@ * INFO = 0 NB = ILAENV( 1, 'CGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) - WORK( 1 ) = CMPLX( LWKOPT ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) INITZ = LSAME( COMPZ, 'I' ) @@ -330,7 +336,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = CONE RETURN @@ -888,7 +893,8 @@ IF ( JCOL.LT.IHI ) $ CALL CGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) - WORK( 1 ) = CMPLX( LWKOPT ) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cggqrf.f b/lapack-netlib/SRC/cggqrf.f index 29b0bf4af..309f170e8 100644 --- a/lapack-netlib/SRC/cggqrf.f +++ b/lapack-netlib/SRC/cggqrf.f @@ -251,8 +251,8 @@ NB2 = ILAENV( 1, 'CGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'CUNMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P)*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -288,7 +288,7 @@ * RQ factorization of N-by-P matrix B: B = T*Z. 
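A recurring change in these hunks is routing integer workspace sizes through SROUNDUP_LWORK before they are stored into a REAL-typed WORK( 1 ). The apparent motivation: for sizes beyond the single-precision integer range, REAL( LWKOPT ) can round down, so INT( WORK( 1 ) ) at the caller would under-allocate. A minimal sketch of that guard, under the assumption that this is the behaviour SROUNDUP_LWORK provides; ROUNDUP_SKETCH is a hypothetical stand-in, not the library routine:

*     Illustrative sketch only: round a REAL-stored workspace size
*     up so that truncation back to INTEGER cannot lose anything.
      REAL FUNCTION ROUNDUP_SKETCH( LWORK )
      INTEGER            LWORK
      ROUNDUP_SKETCH = REAL( LWORK )
      IF( INT( ROUNDUP_SKETCH ).LT.LWORK ) THEN
*        REAL( LWORK ) rounded down; nudge it to the next
*        representable single-precision value.
         ROUNDUP_SKETCH = ROUNDUP_SKETCH *
     $                    ( 1.0E+0 + EPSILON( 1.0E+0 ) )
      END IF
      RETURN
      END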
* CALL CGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO ) - WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( MAX( LOPT, INT( WORK( 1 ) ) ) ) * RETURN * diff --git a/lapack-netlib/SRC/cggrqf.f b/lapack-netlib/SRC/cggrqf.f index 273ab3ef7..8470a1ce2 100644 --- a/lapack-netlib/SRC/cggrqf.f +++ b/lapack-netlib/SRC/cggrqf.f @@ -250,8 +250,8 @@ NB2 = ILAENV( 1, 'CGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'CUNMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P)*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -288,7 +288,7 @@ * QR factorization of P-by-N matrix B: B = Z*T * CALL CGEQRF( P, N, B, LDB, TAUB, WORK, LWORK, INFO ) - WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( MAX( LOPT, INT( WORK( 1 ) ) ) ) * RETURN * diff --git a/lapack-netlib/SRC/cggsvd3.f b/lapack-netlib/SRC/cggsvd3.f index f248aebd5..4c4b85bae 100644 --- a/lapack-netlib/SRC/cggsvd3.f +++ b/lapack-netlib/SRC/cggsvd3.f @@ -278,7 +278,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -333,7 +333,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/cggsvp3.f b/lapack-netlib/SRC/cggsvp3.f index 008a053a2..e19f7efd5 100644 --- a/lapack-netlib/SRC/cggsvp3.f +++ b/lapack-netlib/SRC/cggsvp3.f @@ -233,7 +233,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -256,7 +256,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/cheevd.f b/lapack-netlib/SRC/cheevd.f index b5ca804eb..9b62a2df6 100644 --- a/lapack-netlib/SRC/cheevd.f +++ b/lapack-netlib/SRC/cheevd.f @@ -116,8 +116,7 @@ *> *> \param[out] RWORK *> \verbatim -*> RWORK is REAL array, -*> dimension (LRWORK) +*> RWORK is REAL array, dimension (MAX(1,LRWORK)) *> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. *> \endverbatim *> @@ -282,8 +281,8 @@ LROPT = LRWMIN LIOPT = LIWMIN END IF - WORK( 1 ) = SROUNDUP_LWORK(LOPT) - RWORK( 1 ) = LROPT + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LROPT ) IWORK( 1 ) = LIOPT * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -378,8 +377,8 @@ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LOPT) - RWORK( 1 ) = LROPT + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LROPT ) IWORK( 1 ) = LIOPT * RETURN diff --git a/lapack-netlib/SRC/cheevr.f b/lapack-netlib/SRC/cheevr.f index 05c5e66be..ad5c8cd4a 100644 --- a/lapack-netlib/SRC/cheevr.f +++ b/lapack-netlib/SRC/cheevr.f @@ -272,7 +272,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,2*N). +*> The length of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 2*N. 
*> For optimal efficiency, LWORK >= (NB+1)*N, *> where NB is the max of the blocksize for CHETRD and for *> CUNMTR as returned by ILAENV. @@ -294,7 +295,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -313,7 +315,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -417,9 +420,15 @@ LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR. $ ( LIWORK.EQ.-1 ) ) * - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) - LWMIN = MAX( 1, 2*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 2*N + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -454,8 +463,8 @@ NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -483,7 +492,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = REAL( A( 1, 1 ) ) @@ -710,8 +719,8 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/cheevr_2stage.f b/lapack-netlib/SRC/cheevr_2stage.f index 0332a09bc..e06925fcd 100644 --- a/lapack-netlib/SRC/cheevr_2stage.f +++ b/lapack-netlib/SRC/cheevr_2stage.f @@ -265,7 +265,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of CSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the unitary transformations applied by CUNMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -279,12 +279,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -310,7 +311,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. 
*> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -329,7 +331,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -354,7 +357,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -382,7 +385,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -390,11 +393,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -443,8 +446,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV, ILAENV2STAGE - REAL SLAMCH, CLANSY - EXTERNAL LSAME, SLAMCH, CLANSY, ILAENV, ILAENV2STAGE + REAL SLAMCH, CLANSY, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, CLANSY, ILAENV, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL, @@ -472,9 +476,16 @@ IB = ILAENV2STAGE( 2, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = N + LHTRD + LWTRD - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = N + LHTRD + LWTRD + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -506,8 +517,8 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -535,7 +546,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = REAL( A( 1, 1 ) ) @@ -643,9 +654,9 @@ * * Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. * - CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), + CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), $ RWORK( INDRE ), WORK( INDTAU ), - $ WORK( INDHOUS ), LHTRD, + $ WORK( INDHOUS ), LHTRD, $ WORK( INDWK ), LLWORK, IINFO ) * * If all eigenvalues are desired @@ -666,7 +677,7 @@ CALL SCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 ) CALL SCOPY( N, RWORK( INDRD ), 1, RWORK( INDRDD ), 1 ) * - IF (ABSTOL .LE. TWO*N*EPS) THEN + IF ( ABSTOL .LE. TWO*N*EPS ) THEN TRYRAC = .TRUE. ELSE TRYRAC = .FALSE. 
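Since CHEEVR_2STAGE now computes LWMIN, LRWMIN and LIWMIN jointly (see the hunks above), a caller can retrieve all three with a single query call; note that only the REAL-typed outputs need the SROUNDUP_LWORK treatment, while IWORK( 1 ) is an exact INTEGER. A sketch, with every caller-side variable hypothetical:

*     Illustrative sketch only (not part of the patch): query all
*     three workspaces of CHEEVR_2STAGE at once, then allocate and
*     call again.  All caller variables are hypothetical.
      CALL CHEEVR_2STAGE( 'N', 'A', 'U', N, A, LDA, VL, VU, IL, IU,
     $                    ABSTOL, M, W, Z, LDZ, ISUPPZ, WORK, -1,
     $                    RWORK, -1, IWORK, -1, INFO )
      LWORK = INT( WORK( 1 ) )
      LRWORK = INT( RWORK( 1 ) )
      LIWORK = IWORK( 1 )
*     ... allocate WORK, RWORK and IWORK at these sizes, call again.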
@@ -765,8 +776,8 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/cheevx.f b/lapack-netlib/SRC/cheevx.f index e91599a44..a8a2bde63 100644 --- a/lapack-netlib/SRC/cheevx.f +++ b/lapack-netlib/SRC/cheevx.f @@ -348,14 +348,14 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWKMIN = 1 - WORK( 1 ) = LWKMIN + LWKOPT = 1 ELSE LWKMIN = 2*N NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) ) - LWKOPT = MAX( 1, ( NB + 1 )*N ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = ( NB + 1 )*N END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -17 diff --git a/lapack-netlib/SRC/chesv_aa.f b/lapack-netlib/SRC/chesv_aa.f index 53ecc0a16..0f41c9332 100644 --- a/lapack-netlib/SRC/chesv_aa.f +++ b/lapack-netlib/SRC/chesv_aa.f @@ -177,7 +177,7 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS + INTEGER LWKMIN, LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS * .. * .. External Functions .. LOGICAL LSAME @@ -197,6 +197,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -207,18 +208,18 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 2*N, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_HETRF = INT( WORK(1) ) + LWKOPT_HETRF = INT( WORK( 1 ) ) CALL CHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_HETRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT_HETRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_HETRF, LWKOPT_HETRS ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -240,7 +241,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chesv_aa_2stage.f b/lapack-netlib/SRC/chesv_aa_2stage.f index 12950c4af..05ebd9253 100644 --- a/lapack-netlib/SRC/chesv_aa_2stage.f +++ b/lapack-netlib/SRC/chesv_aa_2stage.f @@ -99,14 +99,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX array, dimension (LTB) +*> TB is COMPLEX array, dimension (MAX(1,LTB)). *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -146,14 +146,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX workspace of size LWORK +*> WORK is COMPLEX workspace of size (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -203,7 +204,7 @@ * * .. Local Scalars .. 
LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKMIN, LWKOPT * .. * .. External Functions .. LOGICAL LSAME @@ -225,6 +226,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -233,18 +235,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -254,7 +257,6 @@ RETURN END IF * -* * Compute the factorization A = U**H*T*U or A = L*T*L**H. * CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, @@ -268,7 +270,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chesvx.f b/lapack-netlib/SRC/chesvx.f index c23a35ce7..bdaad55ec 100644 --- a/lapack-netlib/SRC/chesvx.f +++ b/lapack-netlib/SRC/chesvx.f @@ -307,7 +307,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKMIN, LWKOPT, NB REAL ANORM * .. * .. External Functions .. @@ -329,6 +329,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -346,17 +347,17 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 2*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) END IF - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -405,7 +406,7 @@ IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) $ INFO = N + 1 * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chetrd_2stage.f b/lapack-netlib/SRC/chetrd_2stage.f index f5ad35f27..ec7075798 100644 --- a/lapack-netlib/SRC/chetrd_2stage.f +++ b/lapack-netlib/SRC/chetrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download CHETRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download CHETRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * COMPLEX A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. 
-* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. *> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the unitary *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is COMPLEX array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is COMPLEX array, dimension (LHOUS2) +*> HOUS2 is COMPLEX array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2=-1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -143,13 +145,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -157,9 +162,9 @@ *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +182,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +207,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. 
-*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +215,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -250,7 +255,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - EXTERNAL LSAME, ILAENV2STAGE + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV2STAGE, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -265,10 +271,13 @@ * KD = ILAENV2STAGE( 1, 'CHETRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'CHETRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'CHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -285,8 +294,8 @@ END IF * IF( INFO.EQ.0 ) THEN - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + HOUS2( 1 ) = SROUNDUP_LWORK( LHMIN ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -309,14 +318,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL CHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL CHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRD_HE2HB', -INFO ) RETURN END IF - CALL CHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL CHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRD_HB2ST', -INFO ) @@ -324,8 +333,7 @@ END IF * * - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CHETRD_2STAGE diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F index 3688e40a3..b0d3e45fb 100644 --- a/lapack-netlib/SRC/chetrd_hb2st.F +++ b/lapack-netlib/SRC/chetrd_hb2st.F @@ -132,15 +132,17 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is COMPLEX array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is COMPLEX array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. 
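The HOUS/LHOUS pair documented here follows the same query protocol as WORK/LWORK, and the two queries may be combined in one call. A sketch with hypothetical caller variables, using STAGE1 = 'N' (band matrix supplied directly) and VECT = 'N':

*     Illustrative sketch only: LHOUS = -1 and LWORK = -1 may be
*     passed together; the minimal sizes come back in HOUS( 1 )
*     and WORK( 1 ).  All caller variables are hypothetical.
      CALL CHETRD_HB2ST( 'N', 'N', 'U', N, KD, AB, LDAB, D, E,
     $                   HOUS, -1, WORK, -1, INFO )
      LHOUS = INT( HOUS( 1 ) )
      LWORK = INT( WORK( 1 ) )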
+*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error @@ -152,14 +154,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension LWORK. +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)). +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -262,7 +267,7 @@ INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, + $ NBTILES, TTYPE, TID, NTHREADS, $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SICEV, SIZETAU, LDV, LHMIN, LWMIN @@ -286,7 +291,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -295,9 +299,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -318,8 +327,8 @@ END IF * IF( INFO.EQ.0 ) THEN - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + HOUS( 1 ) = SROUNDUP_LWORK( LHMIN ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -575,8 +584,7 @@ C END IF 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CHETRD_HB2ST diff --git a/lapack-netlib/SRC/chetrd_he2hb.f b/lapack-netlib/SRC/chetrd_he2hb.f index 090f02100..42e71e0b2 100644 --- a/lapack-netlib/SRC/chetrd_he2hb.f +++ b/lapack-netlib/SRC/chetrd_he2hb.f @@ -123,8 +123,8 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY). 
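To make the two-stage workspace formula quoted earlier in this patch (dimension = N*KD + N*max(KD+1,FACTOPTNB) + max(2*KD*KD, KD*NTHREADS) + (KD+1)*N) concrete, here is one worked evaluation with purely illustrative numbers. For N = 1000, KD = 64, FACTOPTNB = 128 and NTHREADS = 8: 1000*64 + 1000*128 + max(8192, 512) + 65*1000 = 64000 + 128000 + 8192 + 65000 = 265192 workspace entries, dominated by the N*max(KD+1,FACTOPTNB) term.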
+*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -294,8 +296,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -314,7 +320,7 @@ CALL XERBLA( 'CHETRD_HE2HB', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN END IF * @@ -507,7 +513,7 @@ END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CHETRD_HE2HB diff --git a/lapack-netlib/SRC/chetrf.f b/lapack-netlib/SRC/chetrf.f index 0c596ffe7..2836e30bc 100644 --- a/lapack-netlib/SRC/chetrf.f +++ b/lapack-netlib/SRC/chetrf.f @@ -107,7 +107,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> \endverbatim *> @@ -228,8 +228,8 @@ * Determine the block size * NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -347,7 +347,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF diff --git a/lapack-netlib/SRC/chetrf_aa.f b/lapack-netlib/SRC/chetrf_aa.f index 0547a4eab..51410a6ed 100644 --- a/lapack-netlib/SRC/chetrf_aa.f +++ b/lapack-netlib/SRC/chetrf_aa.f @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= 2*N. For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -128,7 +130,7 @@ *> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -152,7 +154,7 @@ * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB COMPLEX ALPHA * .. @@ -179,19 +181,26 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.( 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -203,11 +212,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN A( 1, 1 ) = REAL( A( 1, 1 ) ) RETURN END IF @@ -460,7 +469,7 @@ END IF * 20 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF_AA diff --git a/lapack-netlib/SRC/chetrf_aa_2stage.f b/lapack-netlib/SRC/chetrf_aa_2stage.f index 400efdf26..a79343753 100644 --- a/lapack-netlib/SRC/chetrf_aa_2stage.f +++ b/lapack-netlib/SRC/chetrf_aa_2stage.f @@ -87,14 +87,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX array, dimension (LTB) +*> TB is COMPLEX array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX workspace of size LWORK +*> WORK is COMPLEX workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used +*> to select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -188,7 +188,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. @@ -213,9 +214,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -229,10 +230,10 @@ NB = ILAENV( 1, 'CHETRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = SROUNDUP_LWORK( MAX( 1, (3*NB+1)*N ) ) END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N*NB ) ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -241,7 +242,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * diff --git a/lapack-netlib/SRC/chetrf_rk.f b/lapack-netlib/SRC/chetrf_rk.f index ef442c937..a13c740e3 100644 --- a/lapack-netlib/SRC/chetrf_rk.f +++ b/lapack-netlib/SRC/chetrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension ( MAX(1,LWORK) ). +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. 
LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -311,8 +311,8 @@ * Determine the block size * NB = ILAENV( 1, 'CHETRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -488,7 +488,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF_RK diff --git a/lapack-netlib/SRC/chetrf_rook.f b/lapack-netlib/SRC/chetrf_rook.f index 1593c2edc..df0323520 100644 --- a/lapack-netlib/SRC/chetrf_rook.f +++ b/lapack-netlib/SRC/chetrf_rook.f @@ -122,7 +122,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -264,7 +264,7 @@ * NB = ILAENV( 1, 'CHETRF_ROOK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -387,7 +387,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF_ROOK diff --git a/lapack-netlib/SRC/chetri2.f b/lapack-netlib/SRC/chetri2.f index 2865a6440..f15065ae7 100644 --- a/lapack-netlib/SRC/chetri2.f +++ b/lapack-netlib/SRC/chetri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (N+NB+1)*(NB+3) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE CHETRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -147,7 +147,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHETRI2X, CHETRI, XERBLA @@ -159,9 +160,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +178,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = SROUNDUP_LWORK( MINSIZE ) RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - IF( NBMAX .GE. 
N ) THEN
+      IF( NBMAX.GE.N ) THEN
          CALL CHETRI( UPLO, N, A, LDA, IPIV, WORK, INFO )
       ELSE
          CALL CHETRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO )
       END IF
+*
       RETURN
*
*     End of CHETRI2
diff --git a/lapack-netlib/SRC/chetri_3.f b/lapack-netlib/SRC/chetri_3.f
index deda63598..ccfce5070 100644
--- a/lapack-netlib/SRC/chetri_3.f
+++ b/lapack-netlib/SRC/chetri_3.f
@@ -119,16 +119,17 @@
*>
*> \param[out] WORK
*> \verbatim
-*>          WORK is COMPLEX array, dimension (N+NB+1)*(NB+3).
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK)).
*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
*> \endverbatim
*>
*> \param[in] LWORK
*> \verbatim
*>          LWORK is INTEGER
-*>          The length of WORK. LWORK >= (N+NB+1)*(NB+3).
+*>          The length of WORK.
+*>          If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3).
*>
-*>          If LDWORK = -1, then a workspace query is assumed;
+*>          If LWORK = -1, then a workspace query is assumed;
*>          the routine only calculates the optimal size of the
*>          WORK array, returns this value as the first
*>          entry of the WORK array, and no error message related to
@@ -209,8 +210,13 @@
*
*     Determine the block size
*
-      NB = MAX( 1, ILAENV( 1, 'CHETRI_3', UPLO, N, -1, -1, -1 ) )
-      LWKOPT = ( N+NB+1 ) * ( NB+3 )
+      IF( N.EQ.0 ) THEN
+         LWKOPT = 1
+      ELSE
+         NB = MAX( 1, ILAENV( 1, 'CHETRI_3', UPLO, N, -1, -1, -1 ) )
+         LWKOPT = ( N+NB+1 ) * ( NB+3 )
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
*
      IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
         INFO = -1
@@ -218,7 +224,7 @@
         INFO = -2
      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
         INFO = -4
-      ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN
         INFO = -8
      END IF
*
@@ -226,7 +232,6 @@
         CALL XERBLA( 'CHETRI_3', -INFO )
         RETURN
      ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
         RETURN
      END IF
*
@@ -237,7 +242,7 @@
*
      CALL CHETRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO )
*
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
*
      RETURN
*
diff --git a/lapack-netlib/SRC/chetrs_aa.f b/lapack-netlib/SRC/chetrs_aa.f
index 879549106..07179ab92 100644
--- a/lapack-netlib/SRC/chetrs_aa.f
+++ b/lapack-netlib/SRC/chetrs_aa.f
@@ -105,7 +105,13 @@
*> \param[in] LWORK
*> \verbatim
*>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK >= max(1,3*N-2).
+*>          The dimension of the array WORK.
+*>          If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the minimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
*> \endverbatim
*>
*> \param[out] INFO
@@ -151,24 +157,30 @@
*     ..
*     .. Local Scalars ..
      LOGICAL            LQUERY, UPPER
-      INTEGER            K, KP, LWKOPT
+      INTEGER            K, KP, LWKMIN
*     ..
*     .. External Functions ..
      LOGICAL            LSAME
      REAL               SROUNDUP_LWORK
-      EXTERNAL           LSAME,SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
*     ..
*     .. External Subroutines ..
      EXTERNAL           CLACPY, CLACGV, CGTSV, CSWAP, CTRSM, XERBLA
*     ..
*     .. Intrinsic Functions ..
-      INTRINSIC          MAX
+      INTRINSIC          MIN, MAX
*     ..
*     .. Executable Statements ..
*
      INFO = 0
      UPPER = LSAME( UPLO, 'U' )
      LQUERY = ( LWORK.EQ.-1 )
+      IF( MIN( N, NRHS ).EQ.0 ) THEN
+         LWKMIN = 1
+      ELSE
+         LWKMIN = 3*N-2
+      END IF
+*
      IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
         INFO = -1
      ELSE IF( N.LT.0 ) THEN
@@ -179,21 +191,20 @@
         INFO = -5
      ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
         INFO = -8
-      ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKMIN ) RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/clamswlq.f b/lapack-netlib/SRC/clamswlq.f index 5daf60bf6..8f474a3ab 100644 --- a/lapack-netlib/SRC/clamswlq.f +++ b/lapack-netlib/SRC/clamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -193,91 +196,100 @@ *> * ===================================================================== SUBROUTINE CLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME REAL SROUNDUP_LWORK EXTERNAL LSAME, SROUNDUP_LWORK +* .. * .. External Subroutines .. - EXTERNAL CTPMLQT, CGEMLQT, XERBLA + EXTERNAL CTPMLQT, CGEMLQT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. 
MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLAMSWLQ', -INFO ) - WORK(1) = SROUNDUP_LWORK(LW) RETURN - ELSE IF (LQUERY) THEN - WORK(1) = SROUNDUP_LWORK(LW) + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * IF((NB.LE.K).OR.(NB.GE.MAX(M,N,K))) THEN CALL CGEMLQT( SIDE, TRANS, M, N, K, MB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN END IF * @@ -404,7 +416,7 @@ * END IF * - WORK(1) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLAMSWLQ diff --git a/lapack-netlib/SRC/clamtsqr.f b/lapack-netlib/SRC/clamtsqr.f index 05021e642..13625087f 100644 --- a/lapack-netlib/SRC/clamtsqr.f +++ b/lapack-netlib/SRC/clamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -195,45 +197,47 @@ *> * ===================================================================== SUBROUTINE CLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME REAL SROUNDUP_LWORK EXTERNAL LSAME, SROUNDUP_LWORK +* .. * .. External Subroutines .. - EXTERNAL CGEMQRT, CTPMQRT, XERBLA + EXTERNAL CGEMQRT, CTPMQRT, XERBLA * .. * .. Executable Statements .. 
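+*
+*     ( Illustrative sketch only, not part of the routine: the
+*       LWORK = -1 workspace query documented above is typically
+*       driven from the caller as
+*
+*          CALL CLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
+*         $               LDT, C, LDC, WQUERY, -1, INFO )
+*          LWREQ = INT( WQUERY( 1 ) )
+*
+*       after which the caller allocates WORK( LWREQ ) and repeats
+*       the call with LWORK = LWREQ. WQUERY ( a length-1 COMPLEX
+*       array ) and LWREQ are hypothetical caller-side names. )
+*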
* * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -241,11 +245,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -256,38 +266,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = SROUNDUP_LWORK(LW) + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL CGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -412,7 +422,7 @@ * END IF * - WORK(1) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLAMTSQR diff --git a/lapack-netlib/SRC/claqp2rk.c b/lapack-netlib/SRC/claqp2rk.c new file mode 100644 index 000000000..4184c5927 --- /dev/null +++ b/lapack-netlib/SRC/claqp2rk.c @@ -0,0 +1,943 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define 
TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? 
floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex 
pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + *info = *n + kk - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL >= ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL >= ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + +/* Set K, the number of factorized columns. */ + + *k = kk - 1; + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. */ + + i__2 = minmnfact; + for (j = kk; j <= i__2; ++j) { + i__3 = j; + tau[i__3].r = 0.f, tau[i__3].i = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,KK:N): */ +/* 1) swap the KK-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) copy the KK-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than KK in the next loop step.) */ +/* 3) Save the pivot interchange with the indices relative to the */ +/* the original matrix A, not the block A(1:M,1:N). 
*/ + + if (kp != kk) { + cswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1); + vn1[kp] = vn1[kk]; + vn2[kp] = vn2[kk]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[kk]; + jpiv[kk] = itemp; + } + +/* Generate elementary reflector H(KK) using the column A(I:M,KK), */ +/* if the column has more than one element, otherwise */ +/* the elementary reflector would be an identity matrix, */ +/* and TAU(KK) = CZERO. */ + + if (i__ < *m) { + i__2 = *m - i__ + 1; + clarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], & + c__1, &tau[kk]); + } else { + i__2 = kk; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + +/* Check if TAU(KK) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(KK) for Inf, */ +/* since CLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by CLARFG can contain Inf, which requires */ +/* TAU(KK) to contain NaN. Therefore, this case of generating Inf */ +/* by CLARFG is covered by checking TAU(KK) for NaN. */ + + i__2 = kk; + r__1 = tau[i__2].r; + if (sisnan_(&r__1)) { + i__2 = kk; + taunan = tau[i__2].r; + } else /* if(complicated condition) */ { + r__1 = r_imag(&tau[kk]); + if (sisnan_(&r__1)) { + taunan = r_imag(&tau[kk]); + } else { + taunan = 0.f; + } + } + + if (sisnan_(&taunan)) { + *k = kk - 1; + *info = kk; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = taunan; + *relmaxc2nrmk = taunan; + +/* Array TAU(KK:MINMNFACT) is not set and contains */ +/* undefined elements, except the first element TAU(KK) = NaN. */ + + return 0; + } + +/* Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. */ +/* ( If M >= N, then at KK = N there is no residual matrix, */ +/* i.e. no columns of A to update, only columns of B. */ +/* If M < N, then at KK = M-IOFFSET, I = M and we have a */ +/* one-row residual matrix in A and the elementary */ +/* reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update */ +/* is needed for the residual matrix in A and the */ +/* right-hand-side-matrix in B. */ +/* Therefore, we update only if */ +/* KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */ +/* condition is satisfied, not only KK < N+NRHS ) */ + + if (kk < minmnupdt) { + i__2 = i__ + kk * a_dim1; + aikk.r = a[i__2].r, aikk.i = a[i__2].i; + i__2 = i__ + kk * a_dim1; + a[i__2].r = 1.f, a[i__2].i = 0.f; + i__2 = *m - i__ + 1; + i__3 = *n + *nrhs - kk; + r_cnjg(&q__1, &tau[kk]); + clarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &q__1, + &a[i__ + (kk + 1) * a_dim1], lda, &work[1]); + i__2 = i__ + kk * a_dim1; + a[i__2].r = aikk.r, a[i__2].i = aikk.i; + } + + if (kk < minmnfact) { + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */ +/* when KK < f2cmin(M-IOFFSET, N). */ + + i__2 = *n; + for (j = kk + 1; j <= i__2; ++j) { + if (vn1[j] != 0.f) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + +/* Computing 2nd power */ + r__1 = c_abs(&a[i__ + j * a_dim1]) / vn1[j]; + temp = 1.f - r__1 * r__1; + temp = f2cmax(temp,0.f); +/* Computing 2nd power */ + r__1 = vn1[j] / vn2[j]; + temp2 = temp * (r__1 * r__1); + if (temp2 <= tol3z) { + +/* Compute the column 2-norm for the partial */ +/* column A(I+1:M,J) by explicitly computing it, */ +/* and store it in both partial 2-norm vector VN1 */ +/* and exact column 2-norm vector VN2. 
*/ + + i__3 = *m - i__; + vn1[j] = scnrm2_(&i__3, &a[i__ + 1 + j * a_dim1], & + c__1); + vn2[j] = vn1[j]; + + } else { + +/* Update the column 2-norm for the partial */ +/* column A(I+1:M,J) by removing one */ +/* element A(I,J) and store it in partial */ +/* 2-norm vector VN1. */ + + vn1[j] *= sqrt(temp); + + } + } + } + + } + +/* End factorization loop */ + + } + +/* If we reached this point, all colunms have been factorized, */ +/* i.e. no condition was triggered to exit the routine. */ +/* Set the number of factorized columns. */ + + *k = *kmax; + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */ +/* we return. */ + + if (*k < minmnfact) { + + i__1 = *n - *k; + jmaxc2nrm = *k + isamax_(&i__1, &vn1[*k + 1], &c__1); + *maxc2nrmk = vn1[jmaxc2nrm]; + + if (*k == 0) { + *relmaxc2nrmk = 1.f; + } else { + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + } + + } else { + *maxc2nrmk = 0.f; + *relmaxc2nrmk = 0.f; + } + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, set TAUs corresponding to the columns that were */ +/* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO. */ + + i__1 = minmnfact; + for (j = *k + 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + + return 0; + +/* End of CLAQP2RK */ + +} /* claqp2rk_ */ + diff --git a/lapack-netlib/SRC/claqp2rk.f b/lapack-netlib/SRC/claqp2rk.f new file mode 100644 index 000000000..6b1db085a --- /dev/null +++ b/lapack-netlib/SRC/claqp2rk.f @@ -0,0 +1,726 @@ +*> \brief \b CLAQP2RK computes truncated QR factorization with column pivoting of a complex matrix block using Level 2 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CLAQP2RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, +* $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, +* $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, +* $ INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS +* REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER JPIV( * ) +* REAL VN1( * ), VN2( * ) +* COMPLEX A( LDA, * ), TAU( * ), WORK( * ) +* $ +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLAQP2RK computes a truncated (rank K) or full rank Householder QR +*> factorization with column pivoting of the complex matrix +*> block A(IOFFSET+1:M,1:N) as +*> +*> A * P(K) = Q(K) * R(K). +*> +*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N) +*> is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides matrix block B +*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**H * B. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. 
+*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M-IOFFSET,N), then this stopping +*> criterion is not used, factorize columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL, cannot be NaN. +*> +*> The second factorization stopping criterion. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is REAL, cannot be NaN. +*> +*> The third factorization stopping criterion. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine CGEQP3RK. 1 <= KP1 <= N_orig_mat. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is REAL +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine CGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:K) below +*> the diagonal together with the array TAU represent +*> the unitary matrix Q(K) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:K) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS). 
+*> The left part A(IOFFSET+1:M,K+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(K)**H. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is REAL +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is REAL +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is REAL array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is REAL array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (N-1) +*> Used in CLARF subroutine to apply an elementary +*> reflector from the left. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp2rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. 
Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> https://doi.org/10.1137/S1064827595296732
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> https://doi.org/10.1145/1377612.1377616
+*> \endhtmlonly
+
+*> \par Contributors:
+* ==================
+*>
+*> \verbatim
+*>
+*>  November 2023, Igor Kozachenko, James Demmel,
+*>                 EECS Department,
+*>                 University of California, Berkeley, USA.
+*>
+*> \endverbatim
+
+*  =====================================================================
+      SUBROUTINE CLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+     $                     INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            JPIV( * )
+      REAL               VN1( * ), VN2( * )
+      COMPLEX            A( LDA, * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+      COMPLEX            CZERO, CONE
+      PARAMETER          ( CZERO = ( 0.0E+0, 0.0E+0 ),
+     $                     CONE  = ( 1.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT,
+     $                   MINMNUPDT
+      REAL               HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z
+      COMPLEX            AIKK
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CLARF, CLARFG, CSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, REAL, CONJG, IMAG, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            SISNAN
+      INTEGER            ISAMAX
+      REAL               SLAMCH, SCNRM2
+      EXTERNAL           SISNAN, SLAMCH, ISAMAX, SCNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT is the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+*     MINMNUPDT is the smallest dimension
+*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be updated, which
+*     contains the submatrices A(IOFFSET+1:M,1:N) and
+*     B(IOFFSET+1:M,1:NRHS) as column blocks.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      KMAX = MIN( KMAX, MINMNFACT )
+      TOL3Z = SQRT( SLAMCH( 'Epsilon' ) )
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+*     Compute the factorization, KK is the column loop index.
+*
+      DO KK = 1, KMAX
+*
+         I = IOFFSET + KK
+*
+         IF( I.EQ.1 ) THEN
+*
+*           ============================================================
+*
+*           We are at the first column of the original whole matrix A,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine. 
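+*
+*           ( For orientation: in all later steps the pivot is chosen
+*             below, in the ELSE branch, as the remaining column of
+*             largest partial 2-norm,
+*
+*                KP = ( KK-1 ) + ISAMAX( N-KK+1, VN1( KK ), 1 ),
+*
+*             and KP1 is that same rule evaluated once for the whole
+*             matrix by the caller CGEQP3RK. )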
+*
+         KP = KP1
+*
+*           ============================================================
+*
+      ELSE
+*
+*           ============================================================
+*
+*        Determine the pivot column in the KK-th step, i.e. the index
+*        of the column with the maximum 2-norm in the
+*        submatrix A(I:M,K:N).
+*
+         KP = ( KK-1 ) + ISAMAX( N-KK+1, VN1( KK ), 1 )
+*
+*        Determine the maximum column 2-norm and the relative maximum
+*        column 2-norm of the submatrix A(I:M,KK:N) in step KK.
+*        RELMAXC2NRMK will be computed later, after some condition
+*        checks on MAXC2NRMK.
+*
+         MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*        Check if the submatrix A(I:M,KK:N) contains NaN, and set
+*        INFO parameter to the column number, where the first NaN
+*        is found and return from the routine.
+*        We need to check the condition only if the
+*        column index (same as row index) of the original whole
+*        matrix is larger than 1, since the condition for whole
+*        original matrix is checked in the main routine.
+*
+         IF( SISNAN( MAXC2NRMK ) ) THEN
+*
+*           Set K, the number of factorized columns
+*           that are not zero.
+*
+            K = KK - 1
+            INFO = K + KP
+*
+*           Set RELMAXC2NRMK to NaN.
+*
+            RELMAXC2NRMK = MAXC2NRMK
+*
+*           Array TAU(K+1:MINMNFACT) is not set and contains
+*           undefined elements.
+*
+            RETURN
+         END IF
+*
+*           ============================================================
+*
+*        Quick return, if the submatrix A(I:M,KK:N) is
+*        a zero matrix.
+*        We need to check the condition only if the
+*        column index (same as row index) of the original whole
+*        matrix is larger than 1, since the condition for whole
+*        original matrix is checked in the main routine.
+*
+         IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+*           Set K, the number of factorized columns
+*           that are not zero.
+*
+            K = KK - 1
+            RELMAXC2NRMK = ZERO
+*
+*           Set TAUs corresponding to the columns that were not
+*           factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO.
+*
+            DO J = KK, MINMNFACT
+               TAU( J ) = CZERO
+            END DO
+*
+*           Return from the routine.
+*
+            RETURN
+*
+         END IF
+*
+*           ============================================================
+*
+*        Check if the submatrix A(I:M,KK:N) contains Inf,
+*        set INFO parameter to the column number, where
+*        the first Inf is found plus N, and continue
+*        the computation.
+*        We need to check the condition only if the
+*        column index (same as row index) of the original whole
+*        matrix is larger than 1, since the condition for whole
+*        original matrix is checked in the main routine.
+*
+         IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+            INFO = N + KK - 1 + KP
+         END IF
+*
+*           ============================================================
+*
+*        Test for the second and third stopping criteria.
+*        NOTE: There is no need to test for ABSTOL >= ZERO, since
+*        MAXC2NRMK is non-negative. Similarly, there is no need
+*        to test for RELTOL >= ZERO, since RELMAXC2NRMK is
+*        non-negative.
+*        We need to check the condition only if the
+*        column index (same as row index) of the original whole
+*        matrix is larger than 1, since the condition for whole
+*        original matrix is checked in the main routine.
+
+         RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+*
+         IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+*           Set K, the number of factorized columns.
+*
+            K = KK - 1
+*
+*           Set TAUs corresponding to the columns that were not
+*           factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO.
+*
+            DO J = KK, MINMNFACT
+               TAU( J ) = CZERO
+            END DO
+*
+*           Return from the routine. 
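+*
+*           ( At this tolerance-based exit the factorization rank is
+*             final: K = KK - 1, TAU( KK:MINMNFACT ) was zeroed above,
+*             MAXC2NRMK and RELMAXC2NRMK describe the residual that is
+*             being discarded, and INFO keeps any Inf warning set
+*             earlier in this step. )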
+* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,KK:N): +* 1) swap the KK-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) copy the KK-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than KK in the next loop step.) +* 3) Save the pivot interchange with the indices relative to the +* the original matrix A, not the block A(1:M,1:N). +* + IF( KP.NE.KK ) THEN + CALL CSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 ) + VN1( KP ) = VN1( KK ) + VN2( KP ) = VN2( KK ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( KK ) + JPIV( KK ) = ITEMP + END IF +* +* Generate elementary reflector H(KK) using the column A(I:M,KK), +* if the column has more than one element, otherwise +* the elementary reflector would be an identity matrix, +* and TAU(KK) = CZERO. +* + IF( I.LT.M ) THEN + CALL CLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1, + $ TAU( KK ) ) + ELSE + TAU( KK ) = CZERO + END IF +* +* Check if TAU(KK) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(KK) for Inf, +* since CLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by CLARFG can contain Inf, which requires +* TAU(KK) to contain NaN. Therefore, this case of generating Inf +* by CLARFG is covered by checking TAU(KK) for NaN. +* + IF( SISNAN( REAL( TAU(KK) ) ) ) THEN + TAUNAN = REAL( TAU(KK) ) + ELSE IF( SISNAN( IMAG( TAU(KK) ) ) ) THEN + TAUNAN = IMAG( TAU(KK) ) + ELSE + TAUNAN = ZERO + END IF +* + IF( SISNAN( TAUNAN ) ) THEN + K = KK - 1 + INFO = KK +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAUNAN + RELMAXC2NRMK = TAUNAN +* +* Array TAU(KK:MINMNFACT) is not set and contains +* undefined elements, except the first element TAU(KK) = NaN. +* + RETURN + END IF +* +* Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. +* ( If M >= N, then at KK = N there is no residual matrix, +* i.e. no columns of A to update, only columns of B. +* If M < N, then at KK = M-IOFFSET, I = M and we have a +* one-row residual matrix in A and the elementary +* reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update +* is needed for the residual matrix in A and the +* right-hand-side-matrix in B. +* Therefore, we update only if +* KK < MINMNUPDT = min(M-IOFFSET, N+NRHS) +* condition is satisfied, not only KK < N+NRHS ) +* + IF( KK.LT.MINMNUPDT ) THEN + AIKK = A( I, KK ) + A( I, KK ) = CONE + CALL CLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1, + $ CONJG( TAU( KK ) ), A( I, KK+1 ), LDA, + $ WORK( 1 ) ) + A( I, KK ) = AIKK + END IF +* + IF( KK.LT.MINMNFACT ) THEN +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. +* when KK < min(M-IOFFSET, N). +* + DO J = KK + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2 + TEMP = MAX( TEMP, ZERO ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2 .LE. 
TOL3Z ) THEN +* +* Compute the column 2-norm for the partial +* column A(I+1:M,J) by explicitly computing it, +* and store it in both partial 2-norm vector VN1 +* and exact column 2-norm vector VN2. +* + VN1( J ) = SCNRM2( M-I, A( I+1, J ), 1 ) + VN2( J ) = VN1( J ) +* + ELSE +* +* Update the column 2-norm for the partial +* column A(I+1:M,J) by removing one +* element A(I,J) and store it in partial +* 2-norm vector VN1. +* + VN1( J ) = VN1( J )*SQRT( TEMP ) +* + END IF + END IF + END DO +* + END IF +* +* End factorization loop +* + END DO +* +* If we reached this point, all colunms have been factorized, +* i.e. no condition was triggered to exit the routine. +* Set the number of factorized columns. +* + K = KMAX +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before +* we return. +* + IF( K.LT.MINMNFACT ) THEN +* + JMAXC2NRM = K + ISAMAX( N-K, VN1( K+1 ), 1 ) + MAXC2NRMK = VN1( JMAXC2NRM ) +* + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + END IF +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, set TAUs corresponding to the columns that were +* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO. +* + DO J = K + 1, MINMNFACT + TAU( J ) = CZERO + END DO +* + RETURN +* +* End of CLAQP2RK +* + END diff --git a/lapack-netlib/SRC/claqp3rk.c b/lapack-netlib/SRC/claqp3rk.c new file mode 100644 index 000000000..ca305fab7 --- /dev/null +++ b/lapack-netlib/SRC/claqp3rk.c @@ -0,0 +1,1152 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ 
+typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? 
((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static 
_Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, + kb, &q__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* Quick return, if the submatrix A(I:M,K:N) is */ +/* a zero matrix. We need to check it only if the column index */ +/* (same as row index) is larger than 1, since the condition */ +/* for the whole original matrix A_orig is checked in the main */ +/* routine. */ + + if (*maxc2nrmk == 0.f) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *relmaxc2nrmk = 0.f; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix is zero and we stop the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, + kb, &q__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. 
*/ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* Check if the submatrix A(I:M,K:N) contains Inf, */ +/* set INFO parameter to the column number, where */ +/* the first Inf is found plus N, and continue */ +/* the computation. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + if (*info == 0 && *maxc2nrmk > myhugeval) { + *info = *n + k - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third tolerance stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL.GE.ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig; */ + + *kb = k - 1; + if__ = i__ - 1; + +/* Apply the block reflector to the residual of the */ +/* matrix A and the residual of the right hand sides B, if */ +/* the residual matrix and and/or the residual of the right */ +/* hand sides exist, i.e. if the submatrix */ +/* A(I+1:M,KB+1:N+NRHS) exists. This occurs when */ +/* KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, & + i__2, kb, &q__1, &a[if__ + 1 + a_dim1], lda, &f[* + kb + 1 + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*kb + + 1) * a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,K:N): */ +/* 1) swap the K-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */ +/* 3) copy the K-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. 
(Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than K in the next loop step.) */ +/* 4) Save the pivot interchange with the indices relative to the */ +/* the original matrix A_orig, not the block A(1:M,1:N). */ + + if (kp != k) { + cswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + cswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf); + vn1[kp] = vn1[k]; + vn2[kp] = vn2[k]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[k]; + jpiv[k] = itemp; + } + +/* Apply previous Householder reflectors to column K: */ +/* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. */ + + if (k > 1) { + i__1 = k - 1; + for (j = 1; j <= i__1; ++j) { + i__2 = k + j * f_dim1; + r_cnjg(&q__1, &f[k + j * f_dim1]); + f[i__2].r = q__1.r, f[i__2].i = q__1.i; + } + i__1 = *m - i__ + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = 0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[i__ + a_dim1], lda, + &f[k + f_dim1], ldf, &c_b2, &a[i__ + k * a_dim1], &c__1); + i__1 = k - 1; + for (j = 1; j <= i__1; ++j) { + i__2 = k + j * f_dim1; + r_cnjg(&q__1, &f[k + j * f_dim1]); + f[i__2].r = q__1.r, f[i__2].i = q__1.i; + } + } + +/* Generate elementary reflector H(k) using the column A(I:M,K). */ + + if (i__ < *m) { + i__1 = *m - i__ + 1; + clarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], & + c__1, &tau[k]); + } else { + i__1 = k; + tau[i__1].r = 0.f, tau[i__1].i = 0.f; + } + +/* Check if TAU(K) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(K) for Inf, */ +/* since CLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by CLARFG can contain Inf, which requires */ +/* TAU(K) to contain NaN. Therefore, this case of generating Inf */ +/* by CLARFG is covered by checking TAU(K) for NaN. */ + + i__1 = k; + r__1 = tau[i__1].r; + if (sisnan_(&r__1)) { + i__1 = k; + taunan = tau[i__1].r; + } else /* if(complicated condition) */ { + r__1 = r_imag(&tau[k]); + if (sisnan_(&r__1)) { + taunan = r_imag(&tau[k]); + } else { + taunan = 0.f; + } + } + + if (sisnan_(&taunan)) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *info = k; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = taunan; + *relmaxc2nrmk = taunan; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix contains NaN and we stop */ +/* the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. 
*/ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, kb, + &q__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * a_dim1], + lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* =============================================================== */ + + i__1 = i__ + k * a_dim1; + aik.r = a[i__1].r, aik.i = a[i__1].i; + i__1 = i__ + k * a_dim1; + a[i__1].r = 1.f, a[i__1].i = 0.f; + +/* =============================================================== */ + +/* Compute the current K-th column of F: */ +/* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). */ + + if (k < *n + *nrhs) { + i__1 = *m - i__ + 1; + i__2 = *n + *nrhs - k; + cgemv_("Conjugate transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + + 1) * a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, & + f[k + 1 + k * f_dim1], &c__1); + } + +/* 2) Zero out elements above and on the diagonal of the */ +/* column K in matrix F, i.e elements F(1:K,K). */ + + i__1 = k; + for (j = 1; j <= i__1; ++j) { + i__2 = j + k * f_dim1; + f[i__2].r = 0.f, f[i__2].i = 0.f; + } + +/* 3) Incremental updating of the K-th column of F: */ +/* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H */ +/* * A(I:M,K). */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + i__3 = k; + q__1.r = -tau[i__3].r, q__1.i = -tau[i__3].i; + cgemv_("Conjugate Transpose", &i__1, &i__2, &q__1, &a[i__ + + a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, &auxv[1] + , &c__1); + + i__1 = *n + *nrhs; + i__2 = k - 1; + cgemv_("No transpose", &i__1, &i__2, &c_b2, &f[f_dim1 + 1], ldf, & + auxv[1], &c__1, &c_b2, &f[k * f_dim1 + 1], &c__1); + } + +/* =============================================================== */ + +/* Update the current I-th row of A: */ +/* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */ +/* - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. */ + + if (k < *n + *nrhs) { + i__1 = *n + *nrhs - k; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &c__1, &i__1, &k, & + q__1, &a[i__ + a_dim1], lda, &f[k + 1 + f_dim1], ldf, & + c_b2, &a[i__ + (k + 1) * a_dim1], lda); + } + + i__1 = i__ + k * a_dim1; + a[i__1].r = aik.r, a[i__1].i = aik.i; + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */ +/* when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */ + + if (k < minmnfact) { + + i__1 = *n; + for (j = k + 1; j <= i__1; ++j) { + if (vn1[j] != 0.f) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + + temp = c_abs(&a[i__ + j * a_dim1]) / vn1[j]; +/* Computing MAX */ + r__1 = 0.f, r__2 = (temp + 1.f) * (1.f - temp); + temp = f2cmax(r__1,r__2); +/* Computing 2nd power */ + r__1 = vn1[j] / vn2[j]; + temp2 = temp * (r__1 * r__1); + if (temp2 <= tol3z) { + +/* At J-index, we have a difficult column for the */ +/* update of the 2-norm. Save the index of the previous */ +/* difficult column in IWORK(J-1). */ +/* NOTE: ILSTCC > 1, threfore we can use IWORK only */ +/* with N-1 elements, where the elements are */ +/* shifted by 1 to the left. */ + + iwork[j - 1] = lsticc; + +/* Set the index of the last difficult column LSTICC. 
*/ + + lsticc = j; + + } else { + vn1[j] *= sqrt(temp); + } + } + } + + } + +/* End of while loop. */ + + } + +/* Now, afler the loop: */ +/* Set KB, the number of factorized columns in the block; */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig, IF = IOFFSET + KB. */ + + *kb = k; + if__ = i__; + +/* Apply the block reflector to the residual of the matrix A */ +/* and the residual of the right hand sides B, if the residual */ +/* matrix and and/or the residual of the right hand sides */ +/* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */ +/* This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, &i__2, kb, &q__1, + &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b2, + &a[if__ + 1 + (*kb + 1) * a_dim1], lda); + } + +/* Recompute the 2-norm of the difficult columns. */ +/* Loop over the index of the difficult columns from the largest */ +/* to the smallest index. */ + + while(lsticc > 0) { + +/* LSTICC is the index of the last difficult column is greater */ +/* than 1. */ +/* ITEMP is the index of the previous difficult column. */ + + itemp = iwork[lsticc - 1]; + +/* Compute the 2-norm explicilty for the last difficult column and */ +/* save it in the partial and exact 2-norm vectors VN1 and VN2. */ + +/* NOTE: The computation of VN1( LSTICC ) relies on the fact that */ +/* SCNRM2 does not fail on vectors with norm below the value of */ +/* SQRT(SLAMCH('S')) */ + + i__1 = *m - if__; + vn1[lsticc] = scnrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1); + vn2[lsticc] = vn1[lsticc]; + +/* Downdate the index of the last difficult column to */ +/* the index of the previous difficult column. */ + + lsticc = itemp; + + } + + return 0; + +/* End of CLAQP3RK */ + +} /* claqp3rk_ */ + diff --git a/lapack-netlib/SRC/claqp3rk.f b/lapack-netlib/SRC/claqp3rk.f new file mode 100644 index 000000000..3703bcbd6 --- /dev/null +++ b/lapack-netlib/SRC/claqp3rk.f @@ -0,0 +1,947 @@ +*> \brief \b CLAQP3RK computes a step of truncated QR factorization with column pivoting of a complex m-by-n matrix A using Level 3 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CLAQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, +* $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, +* $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) +* IMPLICIT NONE +* LOGICAL DONE +* INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, +* $ NB, NRHS +* REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* REAL VN1( * ), VN2( * ) +* COMPLEX*16 A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLAQP3RK computes a step of truncated QR factorization with column +*> pivoting of a complex M-by-N matrix A block A(IOFFSET+1:M,1:N) +*> by using Level 3 BLAS as +*> +*> A * P(KB) = Q(KB) * R(KB). 
+*>
+*> The routine tries to factorize NB columns from A starting from
+*> the row IOFFSET+1 and updates the residual matrix with BLAS 3
+*> xGEMM. The number of actually factorized columns is returned
+*> in KB and may be smaller than NB.
+*>
+*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides B matrix stored
+*> in A(IOFFSET+1:M,N+1:N+NRHS) with Q(KB)**H * B.
+*>
+*> Cases when the number of factorized columns KB < NB:
+*>
+*> (1) In some cases, due to catastrophic cancellations, it cannot
+*> factorize all NB columns and needs to update the residual matrix.
+*> Hence, the actual number of factorized columns in the block returned
+*> in KB is smaller than NB. The logical DONE is returned as FALSE.
+*> The factorization of the whole original matrix A_orig must proceed
+*> with the next block.
+*>
+*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB.
+*>
+*> (3) In case neither stopping criterion ABSTOL nor RELTOL is used,
+*> and when the residual matrix is a zero matrix in some factorization
+*> step KB, the factorization of the whole original matrix A_orig is
+*> stopped, the logical DONE is returned as TRUE. The number of
+*> factorized columns which is smaller than NB is returned in KB.
+*>
+*> (4) Whenever NaN is detected in the matrix A or in the array TAU,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB. The INFO
+*> parameter is set to the column index of the first NaN occurrence.
+*>
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          Factorization block size, i.e. the number of columns
+*>          to factorize in the matrix A. 0 <= NB
+*>
+*>          If NB = 0, then the routine exits immediately.
+*>          This means that the factorization is not performed,
+*>          the matrices A and B and the arrays TAU, JPIV
+*>          are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is REAL, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          b) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine CGEQP3RK. 1 <= KP1 <= N_orig.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is REAL
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine CGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>          the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                               N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:KB) below
+*>             the diagonal together with the array TAU represent
+*>             the unitary matrix Q(KB) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:KB) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,KB+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(KB)**H.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] DONE
+*> \verbatim
+*>          DONE is LOGICAL
+*>          TRUE: a) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to ABSTOL
+*>                   or RELTOL criterion,
+*>                b) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to the
+*>                   residual matrix being a ZERO matrix.
+*>                c) when NaN was detected in the matrix A
+*>                   or in the array TAU.
+*>          FALSE: otherwise.
+*> \endverbatim
+*>
+*> \param[out] KB
+*> \verbatim
+*>          KB is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= KB <= min(M-IOFFSET,NB,N).
+*>
+*>          KB also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is REAL
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank KB. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is REAL
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank KB) to the maximum column 2-norm of the
+*>          original matrix A_orig. RELMAXC2NRMK >= 0.
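+*>
+*> (Editorial illustration, not part of the original documentation:
+*> RELMAXC2NRMK is the quantity tested against RELTOL. For example,
+*> if MAXC2NRM = 4.0E+0 and the maximum residual column 2-norm has
+*> dropped to MAXC2NRMK = 2.0E-6, then RELMAXC2NRMK = 5.0E-7, and the
+*> factorization stops as soon as RELTOL >= 5.0E-7, or, independently,
+*> as soon as MAXC2NRMK <= ABSTOL.)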
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is REAL array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is REAL array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] AUXV
+*> \verbatim
+*>          AUXV is COMPLEX array, dimension (NB)
+*>          Auxiliary vector.
+*> \endverbatim
+*>
+*> \param[out] F
+*> \verbatim
+*>          F is COMPLEX array, dimension (LDF,NB)
+*>          Matrix F**H = L*(Y**H)*A.
+*> \endverbatim
+*>
+*> \param[in] LDF
+*> \verbatim
+*>          LDF is INTEGER
+*>          The leading dimension of the array F. LDF >= max(1,N+NRHS).
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          A work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step KB+1 ( when KB columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             KB                  is set to the number of
+*>                                    factorized columns without
+*>                                    exception.
+*>             MAXC2NRMK           is set to NaN.
+*>             RELMAXC2NRMK        is set to NaN.
+*>             TAU(KB+1:min(M,N))  is not set and contains undefined
+*>                                    elements. If j_1=KB+1, TAU(KB+1)
+*>                                    may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step KB+1 ( when KB columns have been factorized ).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp3rk
+*
+*> \par References:
+* ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> https://doi.org/10.1137/S1064827595296732
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> https://doi.org/10.1145/1377612.1377616
+*> \endhtmlonly
+*
+*> \par Contributors:
+* ==================
+*>
+*> \verbatim
+*>
+*>  November 2023, Igor Kozachenko, James Demmel,
+*>                 EECS Department,
+*>                 University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+* =====================================================================
+      SUBROUTINE CLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      LOGICAL            DONE
+      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+     $                   NB, NRHS
+      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      REAL               VN1( * ), VN2( * )
+      COMPLEX            A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+      COMPLEX            CZERO, CONE
+      PARAMETER          ( CZERO = ( 0.0E+0, 0.0E+0 ),
+     $                     CONE = ( 1.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            ITEMP, J, K, MINMNFACT, MINMNUPDT,
+     $                   LSTICC, KP, I, IF
+      REAL               HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z
+      COMPLEX            AIK
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CGEMM, CGEMV, CLARFG, CSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, REAL, CONJG, IMAG, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            SISNAN
+      INTEGER            ISAMAX
+      REAL               SLAMCH, SCNRM2
+      EXTERNAL           SISNAN, SLAMCH, ISAMAX, SCNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT is the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      NB = MIN( NB, MINMNFACT )
+      TOL3Z = SQRT( SLAMCH( 'Epsilon' ) )
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+*     Compute factorization in a while loop over NB columns,
+*     K is the column index in the block A(1:M,1:N).
+*
+      K = 0
+      LSTICC = 0
+      DONE = .FALSE.
+*
+      DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 )
+         K = K + 1
+         I = IOFFSET + K
+*
+         IF( I.EQ.1 ) THEN
+*
+*           We are at the first column of the original whole matrix A_orig,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+            KP = KP1
+*
+         ELSE
+*
+*           Determine the pivot column in K-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,K:N).
+*
+            KP = ( K-1 ) + ISAMAX( N-K+1, VN1( K ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,K:N) in step K.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains NaN, set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( SISNAN( MAXC2NRMK ) ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*              that are non-zero in each step in the block,
+*              i.e. the rank of the factor R.
+* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = KB + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. + + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* Quick return, if the submatrix A(I:M,K:N) is +* a zero matrix. We need to check it only if the column index +* (same as row index) is larger than 1, since the condition +* for the whole original matrix A_orig is checked in the main +* routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + RELMAXC2NRMK = ZERO +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix is zero and we stop the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. +* + DO J = K, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. 
MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + K - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third tolerance stopping criteria. +* NOTE: There is no need to test for ABSTOL.GE.ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig; +* + KB = K - 1 + IF = I - 1 +* +* Apply the block reflector to the residual of the +* matrix A and the residual of the right hand sides B, if +* the residual matrix and and/or the residual of the right +* hand sides exist, i.e. if the submatrix +* A(I+1:M,KB+1:N+NRHS) exists. This occurs when +* KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, N+NRHS-KB, KB,-CONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. +* + DO J = K, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,K:N): +* 1) swap the K-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) +* 3) copy the K-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. (Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than K in the next loop step.) +* 4) Save the pivot interchange with the indices relative to the +* the original matrix A_orig, not the block A(1:M,1:N). +* + IF( KP.NE.K ) THEN + CALL CSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 ) + CALL CSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF ) + VN1( KP ) = VN1( K ) + VN2( KP ) = VN2( K ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( K ) + JPIV( K ) = ITEMP + END IF +* +* Apply previous Householder reflectors to column K: +* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. +* + IF( K.GT.1 ) THEN + DO J = 1, K - 1 + F( K, J ) = CONJG( F( K, J ) ) + END DO + CALL CGEMV( 'No transpose', M-I+1, K-1, -CONE, A( I, 1 ), + $ LDA, F( K, 1 ), LDF, CONE, A( I, K ), 1 ) + DO J = 1, K - 1 + F( K, J ) = CONJG( F( K, J ) ) + END DO + END IF +* +* Generate elementary reflector H(k) using the column A(I:M,K). 
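+*
+*     ( Editorial note: CLARFG represents the reflector as
+*     H( K ) = I - TAU( K )*V*V**H with implicit V( 1 ) = 1, chosen
+*     so that H( K )**H * A(I:M,K) = ( BETA, 0, ..., 0 )**T with BETA
+*     real; on exit A( I, K ) holds BETA and A(I+1:M,K) holds the
+*     rest of V. )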
+* + IF( I.LT.M ) THEN + CALL CLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) ) + ELSE + TAU( K ) = CZERO + END IF +* +* Check if TAU(K) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(K) for Inf, +* since CLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by CLARFG can contain Inf, which requires +* TAU(K) to contain NaN. Therefore, this case of generating Inf +* by CLARFG is covered by checking TAU(K) for NaN. +* + IF( SISNAN( REAL( TAU(K) ) ) ) THEN + TAUNAN = REAL( TAU(K) ) + ELSE IF( SISNAN( IMAG( TAU(K) ) ) ) THEN + TAUNAN = IMAG( TAU(K) ) + ELSE + TAUNAN = ZERO + END IF +* + IF( SISNAN( TAUNAN ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = K +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAUNAN + RELMAXC2NRMK = TAUNAN +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* =============================================================== +* + AIK = A( I, K ) + A( I, K ) = CONE +* +* =============================================================== +* +* Compute the current K-th column of F: +* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). +* + IF( K.LT.N+NRHS ) THEN + CALL CGEMV( 'Conjugate transpose', M-I+1, N+NRHS-K, + $ TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1, + $ CZERO, F( K+1, K ), 1 ) + END IF +* +* 2) Zero out elements above and on the diagonal of the +* column K in matrix F, i.e elements F(1:K,K). +* + DO J = 1, K + F( J, K ) = CZERO + END DO +* +* 3) Incremental updating of the K-th column of F: +* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H +* * A(I:M,K). +* + IF( K.GT.1 ) THEN + CALL CGEMV( 'Conjugate Transpose', M-I+1, K-1, -TAU( K ), + $ A( I, 1 ), LDA, A( I, K ), 1, CZERO, + $ AUXV( 1 ), 1 ) +* + CALL CGEMV( 'No transpose', N+NRHS, K-1, CONE, + $ F( 1, 1 ), LDF, AUXV( 1 ), 1, CONE, + $ F( 1, K ), 1 ) + END IF +* +* =============================================================== +* +* Update the current I-th row of A: +* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) +* - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. 
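+*
+*     ( Editorial note: A( I, K ) was temporarily overwritten with CONE
+*     above, so that the GEMV/GEMM calls operate with the implicit
+*     unit first element of the Householder vector; the original
+*     entry, saved in AIK, is restored right after this row update. )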
+*
+      IF( K.LT.N+NRHS ) THEN
+         CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $               1, N+NRHS-K, K, -CONE, A( I, 1 ), LDA,
+     $               F( K+1, 1 ), LDF, CONE, A( I, K+1 ), LDA )
+      END IF
+*
+      A( I, K ) = AIK
+*
+*     Update the partial column 2-norms for the residual matrix,
+*     only if the residual matrix A(I+1:M,K+1:N) exists, i.e.
+*     when K < MINMNFACT = min( M-IOFFSET, N ).
+*
+      IF( K.LT.MINMNFACT ) THEN
+*
+         DO J = K + 1, N
+            IF( VN1( J ).NE.ZERO ) THEN
+*
+*              NOTE: The following lines follow from the analysis in
+*              Lapack Working Note 176.
+*
+               TEMP = ABS( A( I, J ) ) / VN1( J )
+               TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) )
+               TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+               IF( TEMP2.LE.TOL3Z ) THEN
+*
+*                 At J-index, we have a difficult column for the
+*                 update of the 2-norm. Save the index of the previous
+*                 difficult column in IWORK(J-1).
+*                 NOTE: LSTICC > 1, therefore we can use IWORK only
+*                 with N-1 elements, where the elements are
+*                 shifted by 1 to the left.
+*
+                  IWORK( J-1 ) = LSTICC
+*
+*                 Set the index of the last difficult column LSTICC.
+*
+                  LSTICC = J
+*
+               ELSE
+                  VN1( J ) = VN1( J )*SQRT( TEMP )
+               END IF
+            END IF
+         END DO
+*
+      END IF
+*
+*     End of while loop.
+*
+      END DO
+*
+*     Now, after the loop:
+*     Set KB, the number of factorized columns in the block;
+*     Set IF, the number of processed rows in the block, which
+*     is the same as the number of processed rows in
+*     the original whole matrix A_orig, IF = IOFFSET + KB.
+*
+      KB = K
+      IF = I
+*
+*     Apply the block reflector to the residual of the matrix A
+*     and the residual of the right hand sides B, if the residual
+*     matrix and/or the residual of the right hand sides
+*     exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists.
+*     This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*     A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                        A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H.
+*
+      IF( KB.LT.MINMNUPDT ) THEN
+         CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $               M-IF, N+NRHS-KB, KB, -CONE, A( IF+1, 1 ), LDA,
+     $               F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA )
+      END IF
+*
+*     Recompute the 2-norm of the difficult columns.
+*     Loop over the index of the difficult columns from the largest
+*     to the smallest index.
+*
+      DO WHILE( LSTICC.GT.0 )
+*
+*        LSTICC is the index of the last difficult column and
+*        is greater than 1.
+*        ITEMP is the index of the previous difficult column.
+*
+         ITEMP = IWORK( LSTICC-1 )
+*
+*        Compute the 2-norm explicitly for the last difficult column
+*        and save it in the partial and exact 2-norm vectors VN1
+*        and VN2.
+*
+*        NOTE: The computation of VN1( LSTICC ) relies on the fact
+*        that SCNRM2 does not fail on vectors with norm below the
+*        value of SQRT(SLAMCH('S')).
+*
+         VN1( LSTICC ) = SCNRM2( M-IF, A( IF+1, LSTICC ), 1 )
+         VN2( LSTICC ) = VN1( LSTICC )
+*
+*        Downdate the index of the last difficult column to
+*        the index of the previous difficult column.
+*
+         LSTICC = ITEMP
+*
+      END DO
+*
+      RETURN
+*
+*     End of CLAQP3RK
+*
+      END
diff --git a/lapack-netlib/SRC/claswlq.f b/lapack-netlib/SRC/claswlq.f
index 12e8373df..2044e055c 100644
--- a/lapack-netlib/SRC/claswlq.f
+++ b/lapack-netlib/SRC/claswlq.f
@@ -96,22 +96,24 @@
 *> The leading dimension of the array T. LDT >= MB.
 *> \endverbatim
 *>
-*>
 *> \param[out] WORK
 *> \verbatim
-*> (workspace) COMPLEX array, dimension (MAX(1,LWORK))
-*>
+*> (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
+*>
 *> \param[in] LWORK
 *> \verbatim
 *> LWORK is INTEGER
-*> The dimension of the array WORK.
LWORK >= MB*M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -163,33 +165,35 @@ *> * ===================================================================== SUBROUTINE CLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), T( LDT, *) + COMPLEX A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME INTEGER ILAENV REAL SROUNDUP_LWORK EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL CGELQT, CTPLQT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -200,12 +204,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LE.0 ) THEN INFO = -4 @@ -213,60 +224,61 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = SROUNDUP_LWORK(MB*M) +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN + IF( (M.GE.N) .OR. (NB.LE.M) .OR. 
(NB.GE.N) ) THEN CALL CGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL CGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL CGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M , (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL CTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + CALL CTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), $ LDA, T(1,CTR*M+1), $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL CTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), $ LDA, T(1,CTR*M+1), LDT, $ WORK, INFO ) - END IF + END IF * - WORK( 1 ) = SROUNDUP_LWORK(M * MB) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLASWLQ diff --git a/lapack-netlib/SRC/clatrs3.f b/lapack-netlib/SRC/clatrs3.f index 0502f6898..354141a8b 100644 --- a/lapack-netlib/SRC/clatrs3.f +++ b/lapack-netlib/SRC/clatrs3.f @@ -152,13 +152,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK). +*> WORK is REAL array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal size of *> WORK. *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -166,6 +170,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -182,7 +187,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -257,15 +262,16 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, CLANGE, SLARMM - EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM + REAL SLAMCH, CLANGE, SLARMM, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLATRS, CSSCAL, XERBLA @@ -296,15 +302,24 @@ * row. WORK( I + KK * LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). 
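+*
+*     ( Editorial illustration, assuming sample values: for N = 1000,
+*     NB = 64, NRHS = 4 and NBRHS = 32 one gets
+*     NBA = (1000+64-1)/64 = 16, hence
+*     LSCALE = 16*MAX( 16, MIN( 4, 32 ) ) = 256, LANRM = 16*16 = 256,
+*     and the minimal workspace is LWMIN = LSCALE + LANRM = 512. )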
+* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * * Test the input parameters. * @@ -326,7 +341,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN @@ -659,6 +674,9 @@ END IF END DO END DO +* + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) +* RETURN * * End of CLATRS3 diff --git a/lapack-netlib/SRC/clatsqr.f b/lapack-netlib/SRC/clatsqr.f index cd2cb4aa7..67403693f 100644 --- a/lapack-netlib/SRC/clatsqr.f +++ b/lapack-netlib/SRC/clatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -165,32 +168,34 @@ *> * ===================================================================== SUBROUTINE CLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), T(LDT, *) + COMPLEX A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, LWMIN, MINMN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME REAL SROUNDUP_LWORK EXTERNAL LSAME, SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. - EXTERNAL CGEQRT, CTPQRT, XERBLA + EXTERNAL CGEQRT, CTPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -201,6 +206,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -208,64 +220,65 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
(.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = SROUNDUP_LWORK(NB*N) +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL CGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + IF ( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL CGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL CGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) - CTR = 1 + CALL CGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = MB+1, II-MB+N , (MB-N) + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL CTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + CALL CTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, $ T(1,CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL CTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + IF( II.LE.M ) THEN + CALL CTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, $ T(1, CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + $ WORK, INFO ) + END IF * - WORK( 1 ) = SROUNDUP_LWORK(N*NB) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLATSQR diff --git a/lapack-netlib/SRC/dbbcsd.f b/lapack-netlib/SRC/dbbcsd.f index 4fcf9ab5d..913f96a73 100644 --- a/lapack-netlib/SRC/dbbcsd.f +++ b/lapack-netlib/SRC/dbbcsd.f @@ -805,7 +805,7 @@ CALL DLARTGP( B22BULGE, B22E(I-1), WORK(IU2SN+I-1), $ WORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL DLARTGS( B21E(I), B21E(I+1), NU, WORK(IU2CS+I-1), + CALL DLARTGS( B21E(I), B21D(I+1), NU, WORK(IU2CS+I-1), $ WORK(IU2SN+I-1) ) ELSE CALL DLARTGS( B22D(I), B22E(I), MU, WORK(IU2CS+I-1), diff --git a/lapack-netlib/SRC/dgebrd.f b/lapack-netlib/SRC/dgebrd.f index 0f0d1651a..ac11d48a0 100644 --- a/lapack-netlib/SRC/dgebrd.f +++ b/lapack-netlib/SRC/dgebrd.f @@ -122,7 +122,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -147,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -223,8 +224,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. 
EXTERNAL DGEBD2, DGEMM, DLABRD, XERBLA @@ -241,9 +242,17 @@ * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + ENDIF WORK( 1 ) = DBLE( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -251,7 +260,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -263,7 +272,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -282,7 +290,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using diff --git a/lapack-netlib/SRC/dgedmd.f90 b/lapack-netlib/SRC/dgedmd.f90 index 20424808f..15df48fe9 100644 --- a/lapack-netlib/SRC/dgedmd.f90 +++ b/lapack-netlib/SRC/dgedmd.f90 @@ -1,424 +1,574 @@ - SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & - M, N, X, LDX, Y, LDY, NRNK, TOL, & - K, REIG, IMEIG, Z, LDZ, RES, & - B, LDB, W, LDW, S, LDS, & - WORK, LWORK, IWORK, LIWORK, INFO ) -! March 2023 +!> \brief \b DGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== +! +! SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, REIG, IMEIG, Z, LDZ, RES, & +! B, LDB, W, LDW, S, LDS, & +! WORK, LWORK, IWORK, LIWORK, INFO ) +! !..... - USE iso_fortran_env - IMPLICIT NONE - INTEGER, PARAMETER :: WP = real64 +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real64 !..... ! Scalar arguments - CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF - INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & - NRNK, LDZ, LDB, LDW, LDS, & - LWORK, LIWORK - INTEGER, INTENT(OUT) :: K, INFO - REAL(KIND=WP), INTENT(IN) :: TOL +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LWORK, LIWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL ! Array arguments - REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) - REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & - W(LDW,*), S(LDS,*) - REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & - RES(*) - REAL(KIND=WP), INTENT(OUT) :: WORK(*) - INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! DGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, DGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, DGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! 
and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. +! REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & +! RES(*) +! REAL(KIND=WP), INTENT(OUT) :: WORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) ! -!...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ !............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> DGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, DGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, DGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. 
Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim +!...................................................................... +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim +!...................................................................... +!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim +!...................................................................... ! Arguments ! ========= -! JOBS (input) CHARACTER*1 -! Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) is CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim !..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 
'N' :: The eigenvectors are not computed.
+!> \param[in] JOBZ
+!> \verbatim
+!> JOBZ (input) CHARACTER*1
+!> Determines whether the eigenvectors (Koopman modes) will
+!> be computed.
+!> 'V' :: The eigenvectors (Koopman modes) will be computed
+!> and returned in the matrix Z.
+!> See the description of Z.
+!> 'F' :: The eigenvectors (Koopman modes) will be returned
+!> in factored form as the product X(:,1:K)*W, where X
+!> contains a POD basis (leading left singular vectors
+!> of the data matrix X) and W contains the eigenvectors
+!> of the corresponding Rayleigh quotient.
+!> See the descriptions of K, X, W, Z.
+!> 'N' :: The eigenvectors are not computed.
+!> \endverbatim
 !.....
-! JOBR (input) CHARACTER*1
-! Determines whether to compute the residuals.
-! 'R' :: The residuals for the computed eigenpairs will be
-! computed and stored in the array RES.
-! See the description of RES.
-! For this option to be legal, JOBZ must be 'V'.
-! 'N' :: The residuals are not computed.
+!> \param[in] JOBR
+!> \verbatim
+!> JOBR (input) CHARACTER*1
+!> Determines whether to compute the residuals.
+!> 'R' :: The residuals for the computed eigenpairs will be
+!> computed and stored in the array RES.
+!> See the description of RES.
+!> For this option to be legal, JOBZ must be 'V'.
+!> 'N' :: The residuals are not computed.
+!> \endverbatim
 !.....
-! JOBF (input) CHARACTER*1
-! Specifies whether to store information needed for post-
-! processing (e.g. computing refined Ritz vectors)
-! 'R' :: The matrix needed for the refinement of the Ritz
-! vectors is computed and stored in the array B.
-! See the description of B.
-! 'E' :: The unscaled eigenvectors of the Exact DMD are
-! computed and returned in the array B. See the
-! description of B.
-! 'N' :: No eigenvector refinement data is computed.
+!> \param[in] JOBF
+!> \verbatim
+!> JOBF (input) CHARACTER*1
+!> Specifies whether to store information needed for post-
+!> processing (e.g. computing refined Ritz vectors)
+!> 'R' :: The matrix needed for the refinement of the Ritz
+!> vectors is computed and stored in the array B.
+!> See the description of B.
+!> 'E' :: The unscaled eigenvectors of the Exact DMD are
+!> computed and returned in the array B. See the
+!> description of B.
+!> 'N' :: No eigenvector refinement data is computed.
+!> \endverbatim
 !.....
-! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 }
-! Allows for a selection of the SVD algorithm from the
-! LAPACK library.
-! 1 :: DGESVD (the QR SVD algorithm)
-! 2 :: DGESDD (the Divide and Conquer algorithm; if enough
-! workspace available, this is the fastest option)
-! 3 :: DGESVDQ (the preconditioned QR SVD ; this and 4
-! are the most accurate options)
-! 4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3
-! are the most accurate options)
-! For the four methods above, a significant difference in
-! the accuracy of small singular values is possible if
-! the snapshots vary in norm so that X is severely
-! ill-conditioned. If small (smaller than EPS*||X||)
-! singular values are of interest and JOBS=='N', then
-! the options (3, 4) give the most accurate results, where
-! the option 4 is slightly better and with stronger
-! theoretical background.
-! If JOBS=='S', i.e. the columns of X will be normalized,
-! then all methods give nearly equally accurate results.
+!> \param[in] WHTSVD
+!> \verbatim
+!> WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 }
+!> Allows for a selection of the SVD algorithm from the
+!> LAPACK library.
+!> 1 :: DGESVD (the QR SVD algorithm) +!> 2 :: DGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: DGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim !..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim !..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim !..... -! X (input/output) REAL(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. +!> \param[in,out] X +!> \verbatim +!> X (input/output) REAL(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. +!> \endverbatim !..... -! LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. +!> \param[in] LDX +!> \verbatim +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim !..... -! Y (input/workspace/output) REAL(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) REAL(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim !..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. 
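! The hunks above standardize the LWORK = -1 workspace-query
! convention: such a call performs no computation and only stores the
! minimal (or optimal) workspace length in WORK(1). A minimal sketch
! of the intended query-then-allocate calling pattern, using DGEBRD
! from the hunk above; the driver program and its data are
! illustrative only, not part of the patch:
PROGRAM WORKSPACE_QUERY
   IMPLICIT NONE
   INTEGER, PARAMETER :: M = 100, N = 80
   INTEGER :: INFO, LWORK, I, J
   DOUBLE PRECISION :: A(M,N), D(N), E(N-1), TAUQ(N), TAUP(N), WQ(1)
   DOUBLE PRECISION, ALLOCATABLE :: WORK(:)
   EXTERNAL DGEBRD
   ! Some placeholder data so the factorization has work to do.
   DO J = 1, N
      DO I = 1, M
         A(I,J) = 1.0D0 / DBLE(I+J-1)
      END DO
   END DO
   ! Query call: LWORK = -1 only sets WQ(1) to the workspace size
   ! (DBLE(LWKOPT) in the DGEBRD hunk above).
   CALL DGEBRD( M, N, A, M, D, E, TAUQ, TAUP, WQ, -1, INFO )
   LWORK = INT( WQ(1) )
   ALLOCATE( WORK(LWORK) )
   ! Actual reduction to bidiagonal form with the queried size.
   CALL DGEBRD( M, N, A, M, D, E, TAUQ, TAUP, WORK, LWORK, INFO )
END PROGRAM WORKSPACE_QUERY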
+!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim !..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1). -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1). +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim !..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim !..... -! K (output) INTEGER, 0 <= K <= N -! The dimension of the POD basis for the data snapshot -! matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim !..... -! REIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of REIG contain -! the real parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! See the descriptions of K, IMEIG, and Z. +!> \param[out] REIG +!> \verbatim +!> REIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of REIG contain +!> the real parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> See the descriptions of K, IMEIG, and Z. 
+!> \endverbatim !..... -! IMEIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of IMEIG contain -! the imaginary parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! The eigenvalues are determined as follows: -! If IMEIG(i) == 0, then the corresponding eigenvalue is -! real, LAMBDA(i) = REIG(i). -! If IMEIG(i)>0, then the corresponding complex -! conjugate pair of eigenvalues reads -! LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) -! LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) -! That is, complex conjugate pairs have consecutive -! indices (i,i+1), with the positive imaginary part -! listed first. -! See the descriptions of K, REIG, and Z. +!> \param[out] IMEIG +!> \verbatim +!> IMEIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of IMEIG contain +!> the imaginary parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> The eigenvalues are determined as follows: +!> If IMEIG(i) == 0, then the corresponding eigenvalue is +!> real, LAMBDA(i) = REIG(i). +!> If IMEIG(i)>0, then the corresponding complex +!> conjugate pair of eigenvalues reads +!> LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) +!> LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) +!> That is, complex conjugate pairs have consecutive +!> indices (i,i+1), with the positive imaginary part +!> listed first. +!> See the descriptions of K, REIG, and Z. +!> \endverbatim !..... -! Z (workspace/output) REAL(KIND=WP) M-by-N array -! If JOBZ =='V' then -! Z contains real Ritz vectors as follows: -! If IMEIG(i)=0, then Z(:,i) is an eigenvector of -! the i-th Ritz value; ||Z(:,i)||_2=1. -! If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then -! [Z(:,i) Z(:,i+1)] span an invariant subspace and -! the Ritz values extracted from this subspace are -! REIG(i) + sqrt(-1)*IMEIG(i) and -! REIG(i) - sqrt(-1)*IMEIG(i). -! The corresponding eigenvectors are -! Z(:,i) + sqrt(-1)*Z(:,i+1) and -! Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. -! || Z(:,i:i+1)||_F = 1. -! If JOBZ == 'F', then the above descriptions hold for -! the columns of X(:,1:K)*W(1:K,1:K), where the columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. The columns of W(1:K,1:K) -! are similarly structured: If IMEIG(i) == 0 then -! X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 -! then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and -! X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) -! are the eigenvectors of LAMBDA(i), LAMBDA(i+1). -! See the descriptions of REIG, IMEIG, X and W. +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) REAL(KIND=WP) M-by-N array +!> If JOBZ =='V' then +!> Z contains real Ritz vectors as follows: +!> If IMEIG(i)=0, then Z(:,i) is an eigenvector of +!> the i-th Ritz value; ||Z(:,i)||_2=1. +!> If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then +!> [Z(:,i) Z(:,i+1)] span an invariant subspace and +!> the Ritz values extracted from this subspace are +!> REIG(i) + sqrt(-1)*IMEIG(i) and +!> REIG(i) - sqrt(-1)*IMEIG(i). +!> The corresponding eigenvectors are +!> Z(:,i) + sqrt(-1)*Z(:,i+1) and +!> Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. +!> || Z(:,i:i+1)||_F = 1. +!> If JOBZ == 'F', then the above descriptions hold for +!> the columns of X(:,1:K)*W(1:K,1:K), where the columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. 
The columns of W(1:K,1:K) +!> are similarly structured: If IMEIG(i) == 0 then +!> X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 +!> then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and +!> X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) +!> are the eigenvectors of LAMBDA(i), LAMBDA(i+1). +!> See the descriptions of REIG, IMEIG, X and W. +!> \endverbatim !..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim !..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs. -! If LAMBDA(i) is real, then -! RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. -! If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair -! then -! RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F -! where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] -! [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. -! It holds that -! RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 -! RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 -! where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) -! ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) -! See the description of REIG, IMEIG and Z. +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs. +!> If LAMBDA(i) is real, then +!> RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. +!> If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair +!> then +!> RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F +!> where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] +!> [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. +!> It holds that +!> RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 +!> RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 +!> where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) +!> ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) +!> See the description of REIG, IMEIG and Z. +!> \endverbatim !..... -! B (output) REAL(KIND=WP) M-by-N array. -! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1;K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. +!> \param[out] B +!> \verbatim +!> B (output) REAL(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1;K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim !..... -! LDB (input) INTEGER, LDB >= M -! The leading dimension of the array B. +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim !..... -! W (workspace/output) REAL(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient (real and -! imaginary parts for each complex conjugate pair of the -! eigenvalues). The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! 
W is also used as a workspace to temporarily store the
-! right singular vectors of X.
+!> \param[out] W
+!> \verbatim
+!> W (workspace/output) REAL(KIND=WP) N-by-N array
+!> On exit, W(1:K,1:K) contains the K computed
+!> eigenvectors of the matrix Rayleigh quotient (real and
+!> imaginary parts for each complex conjugate pair of the
+!> eigenvalues). The Ritz vectors (returned in Z) are the
+!> product of X (containing a POD basis for the input
+!> matrix X) and W. See the descriptions of K, S, X and Z.
+!> W is also used as a workspace to temporarily store the
+!> right singular vectors of X.
+!> \endverbatim
 !.....
-! LDW (input) INTEGER, LDW >= N
-! The leading dimension of the array W.
+!> \param[in] LDW
+!> \verbatim
+!> LDW (input) INTEGER, LDW >= N
+!> The leading dimension of the array W.
+!> \endverbatim
 !.....
-! S (workspace/output) REAL(KIND=WP) N-by-N array
-! The array S(1:K,1:K) is used for the matrix Rayleigh
-! quotient. This content is overwritten during
-! the eigenvalue decomposition by DGEEV.
-! See the description of K.
+!> \param[out] S
+!> \verbatim
+!> S (workspace/output) REAL(KIND=WP) N-by-N array
+!> The array S(1:K,1:K) is used for the matrix Rayleigh
+!> quotient. This content is overwritten during
+!> the eigenvalue decomposition by DGEEV.
+!> See the description of K.
+!> \endverbatim
 !.....
-! LDS (input) INTEGER, LDS >= N
-! The leading dimension of the array S.
+!> \param[in] LDS
+!> \verbatim
+!> LDS (input) INTEGER, LDS >= N
+!> The leading dimension of the array S.
+!> \endverbatim
 !.....
-! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
-! On exit, WORK(1:N) contains the singular values of
-! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
-! If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain
-! scaling factor WORK(N+2)/WORK(N+1) used to scale X
-! and Y to avoid overflow in the SVD of X.
-! This may be of interest if the scaling option is off
-! and as many as possible smallest eigenvalues are
-! desired to the highest feasible accuracy.
-! If the call to DGEDMD is only workspace query, then
-! WORK(1) contains the minimal workspace length and
-! WORK(2) is the optimal workspace length. Hence, the
-! leng of work is at least 2.
-! See the description of LWORK.
+!> \param[out] WORK
+!> \verbatim
+!> WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
+!> On exit, WORK(1:N) contains the singular values of
+!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
+!> If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain
+!> scaling factor WORK(N+2)/WORK(N+1) used to scale X
+!> and Y to avoid overflow in the SVD of X.
+!> This may be of interest if the scaling option is off
+!> and as many as possible smallest eigenvalues are
+!> desired to the highest feasible accuracy.
+!> If the call to DGEDMD is only a workspace query, then
+!> WORK(1) contains the minimal workspace length and
+!> WORK(2) is the optimal workspace length. Hence, the
+!> length of WORK is at least 2.
+!> See the description of LWORK.
+!> \endverbatim
 !.....
-! LWORK (input) INTEGER
-! The minimal length of the workspace vector WORK.
-! LWORK is calculated as follows:
-! If WHTSVD == 1 ::
-! If JOBZ == 'V', then
-! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)).
-! If JOBZ == 'N' then
-! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)).
-! Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal
-! workspace length of DGESVD.
-! If WHTSVD == 2 ::
-! If JOBZ == 'V', then
-! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N))
-! If JOBZ == 'N', then
-!
LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the -! minimal workspace length of DGESDD. -! If WHTSVD == 3 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = N+M+MAX(3*N+1, -! MAX(1,3*N+M,5*N),MAX(1,N)) -! is the minimal workspace length of DGESVDQ. -! If WHTSVD == 4 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the -! minimal workspace length of DGEJSV. -! The above expressions are not simplified in order to -! make the usage of WORK more transparent, and for -! easier checking. In any case, LWORK >= 2. -! If on entry LWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LWORK +!> \verbatim +!> LWORK (input) INTEGER +!> The minimal length of the workspace vector WORK. +!> LWORK is calculated as follows: +!> If WHTSVD == 1 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). +!> If JOBZ == 'N' then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). +!> Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal +!> workspace length of DGESVD. +!> If WHTSVD == 2 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the +!> minimal workspace length of DGESDD. +!> If WHTSVD == 3 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = N+M+MAX(3*N+1, +!> MAX(1,3*N+M,5*N),MAX(1,N)) +!> is the minimal workspace length of DGESVDQ. +!> If WHTSVD == 4 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the +!> minimal workspace length of DGEJSV. +!> The above expressions are not simplified in order to +!> make the usage of WORK more transparent, and for +!> easier checking. In any case, LWORK >= 2. +!> If on entry LWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim !..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! 
If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! !............................................................. !............................................................. + SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, REIG, IMEIG, Z, LDZ, RES, & + B, LDB, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +! +! Scalar arguments +! ~~~~~~~~~~~~~~~~ + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LWORK, LIWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! +! Array arguments +! ~~~~~~~~~~~~~~~ + REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +! ! Parameters ! 
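! The REIG/IMEIG/Z packing documented above follows the usual LAPACK
! convention for real nonsymmetric eigenproblems (as in DGEEV): a
! complex conjugate pair occupies two consecutive entries and
! columns, positive imaginary part first. A hedged sketch of
! unpacking the K computed Ritz pairs into complex form; the helper
! name and argument list are illustrative, not part of DGEDMD:
SUBROUTINE UNPACK_RITZ( M, K, REIG, IMEIG, Z, LDZ, LAMBDA, ZC, LDZC )
   IMPLICIT NONE
   INTEGER, INTENT(IN) :: M, K, LDZ, LDZC
   DOUBLE PRECISION, INTENT(IN) :: REIG(*), IMEIG(*), Z(LDZ,*)
   COMPLEX(KIND(1.0D0)), INTENT(OUT) :: LAMBDA(*), ZC(LDZC,*)
   INTEGER :: I
   I = 1
   DO WHILE ( I <= K )
      IF ( IMEIG(I) == 0.0D0 ) THEN
         ! Real Ritz value: Z(:,I) is already the eigenvector.
         LAMBDA(I) = CMPLX( REIG(I), 0.0D0, KIND(1.0D0) )
         ZC(1:M,I) = CMPLX( Z(1:M,I), 0.0D0, KIND(1.0D0) )
         I = I + 1
      ELSE
         ! Conjugate pair: Z(:,I) and Z(:,I+1) hold the real and
         ! imaginary parts of the eigenvector for REIG(I)+i*IMEIG(I).
         LAMBDA(I)   = CMPLX( REIG(I),  IMEIG(I), KIND(1.0D0) )
         LAMBDA(I+1) = CMPLX( REIG(I), -IMEIG(I), KIND(1.0D0) )
         ZC(1:M,I)   = CMPLX( Z(1:M,I),  Z(1:M,I+1), KIND(1.0D0) )
         ZC(1:M,I+1) = CMPLX( Z(1:M,I), -Z(1:M,I+1), KIND(1.0D0) )
         I = I + 2
      END IF
   END DO
END SUBROUTINE UNPACK_RITZ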
~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP - +! ! Local scalars ! ~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -432,10 +582,11 @@ WNTEX, WNTREF, WNTRES, WNTVEC CHARACTER :: JOBZL, T_OR_N CHARACTER :: JSVOPT - +! ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2) +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) DLANGE, DLAMCH, DNRM2 @@ -443,13 +594,13 @@ INTEGER IDAMAX LOGICAL DISNAN, LSAME EXTERNAL DISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~~~~ EXTERNAL DAXPY, DGEMM, DSCAL EXTERNAL DGEEV, DGEJSV, DGESDD, DGESVD, DGESVDQ, & DLACPY, DLASCL, DLASSQ, XERBLA - +! ! Intrinsic functions ! ~~~~~~~~~~~~~~~~~~~ INTRINSIC DBLE, INT, MAX, SQRT @@ -632,7 +783,8 @@ K = 0 DO i = 1, N !WORK(i) = DNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL DLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 @@ -705,7 +857,8 @@ ! carefully computed using DLASSQ. DO i = 1, N !WORK(i) = DNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL DLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 @@ -1051,4 +1204,3 @@ RETURN ! ...... END SUBROUTINE DGEDMD - diff --git a/lapack-netlib/SRC/dgehrd.f b/lapack-netlib/SRC/dgehrd.f index a40c61cb6..d95bbd182 100644 --- a/lapack-netlib/SRC/dgehrd.f +++ b/lapack-netlib/SRC/dgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gehrd * *> \par Further Details: * ===================== @@ -173,7 +173,7 @@ INTEGER IHI, ILO, INFO, LDA, LWORK, N * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) + DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) * .. * * ===================================================================== @@ -182,7 +182,7 @@ INTEGER NBMAX, LDT, TSIZE PARAMETER ( NBMAX = 64, LDT = NBMAX+1, $ TSIZE = LDT*NBMAX ) - DOUBLE PRECISION ZERO, ONE + DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, $ ONE = 1.0D+0 ) * .. @@ -190,7 +190,7 @@ LOGICAL LQUERY INTEGER I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB, $ NBMIN, NH, NX - DOUBLE PRECISION EI + DOUBLE PRECISION EI * .. * .. External Subroutines .. 
EXTERNAL DAXPY, DGEHD2, DGEMM, DLAHR2, DLARFB, DTRMM, @@ -221,12 +221,18 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + ENDIF WORK( 1 ) = LWKOPT END IF * @@ -248,7 +254,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -268,7 +273,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of @@ -344,6 +349,7 @@ * Use unblocked code to reduce the rest of the matrix * CALL DGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) +* WORK( 1 ) = LWKOPT * RETURN diff --git a/lapack-netlib/SRC/dgelq.f b/lapack-netlib/SRC/dgelq.f index 013b6c356..255e8732f 100644 --- a/lapack-netlib/SRC/dgelq.f +++ b/lapack-netlib/SRC/dgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -166,6 +166,8 @@ *> the LQ factorization. *> \endverbatim *> +*> \ingroup gelq +*> * ===================================================================== SUBROUTINE DGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) diff --git a/lapack-netlib/SRC/dgelqf.f b/lapack-netlib/SRC/dgelqf.f index ed3372f96..f0eb00a55 100644 --- a/lapack-netlib/SRC/dgelqf.f +++ b/lapack-netlib/SRC/dgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -118,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gelqf * *> \par Further Details: * ===================== @@ -174,9 +175,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'DGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -184,19 +184,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = LWKOPT RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN diff --git a/lapack-netlib/SRC/dgelsd.f b/lapack-netlib/SRC/dgelsd.f index b1f45a2c6..7dc564f48 100644 --- a/lapack-netlib/SRC/dgelsd.f +++ b/lapack-netlib/SRC/dgelsd.f @@ -188,7 +188,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsolve +*> \ingroup gelsd * *> \par Contributors: * ================== @@ -228,7 +228,7 @@ DOUBLE PRECISION ANRM, BIGNUM, BNRM, EPS, SFMIN, SMLNUM * .. * .. 
External Subroutines .. - EXTERNAL DGEBRD, DGELQF, DGEQRF, DLABAD, DLACPY, DLALSD, + EXTERNAL DGEBRD, DGELQF, DGEQRF, DLACPY, DLALSD, $ DLASCL, DLASET, DORMBR, DORMLQ, DORMQR, XERBLA * .. * .. External Functions .. @@ -276,7 +276,7 @@ $ LOG( TWO ) ) + 1, 0 ) * IF( INFO.EQ.0 ) THEN - MAXWRK = 0 + MAXWRK = 1 LIWORK = 3*MINMN*NLVL + 11*MINMN MM = M IF( M.GE.N .AND. M.GE.MNTHR ) THEN @@ -372,7 +372,6 @@ SFMIN = DLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A if max entry outside range [SMLNUM,BIGNUM]. * diff --git a/lapack-netlib/SRC/dgemlq.f b/lapack-netlib/SRC/dgemlq.f index 3ba209105..757683f46 100644 --- a/lapack-netlib/SRC/dgemlq.f +++ b/lapack-netlib/SRC/dgemlq.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -160,11 +161,13 @@ *> block sizes MB and NB returned by ILAENV, DGELQ will use either *> DLASWLQ (if the matrix is wide-and-short) or DGELQT to compute *> the LQ factorization. -*> This version of DGEMLQ will use either DLAMSWLQ or DGEMLQT to +*> This version of DGEMLQ will use either DLAMSWLQ or DGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in DLAMSWLQ or DGEMLQT. *> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE DGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -186,7 +189,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -202,7 +205,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -217,6 +220,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -245,12 +255,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
.NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN @@ -262,7 +272,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -275,7 +285,7 @@ $ MB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = LWMIN * RETURN * diff --git a/lapack-netlib/SRC/dgemqr.f b/lapack-netlib/SRC/dgemqr.f index 022cf21e4..608815483 100644 --- a/lapack-netlib/SRC/dgemqr.f +++ b/lapack-netlib/SRC/dgemqr.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -160,12 +161,14 @@ *> block sizes MB and NB returned by ILAENV, DGEQR will use either *> DLATSQR (if the matrix is tall-and-skinny) or DGEQRT to compute *> the QR factorization. -*> This version of DGEMQR will use either DLAMTSQR or DGEMQRT to +*> This version of DGEMQR will use either DLAMTSQR or DGEMQRT to *> multiply matrix Q by another matrix. *> Further Details in DLATMSQR or DGEMQRT. *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE DGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -187,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -203,7 +206,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -218,6 +221,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -246,12 +256,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
.NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN @@ -263,7 +273,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -276,7 +286,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = LWMIN * RETURN * diff --git a/lapack-netlib/SRC/dgeqlf.f b/lapack-netlib/SRC/dgeqlf.f index b8ac0b1a0..a72d9dc76 100644 --- a/lapack-netlib/SRC/dgeqlf.f +++ b/lapack-netlib/SRC/dgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is the *> optimal blocksize. *> @@ -113,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup geqlf * *> \par Further Details: * ===================== @@ -188,8 +189,9 @@ END IF WORK( 1 ) = LWKOPT * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * diff --git a/lapack-netlib/SRC/dgeqp3rk.c b/lapack-netlib/SRC/dgeqp3rk.c new file mode 100644 index 000000000..17a78dd5a --- /dev/null +++ b/lapack-netlib/SRC/dgeqp3rk.c @@ -0,0 +1,1059 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint 
ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + +/* Check if the matrix A contains +Inf or -Inf, set INFO parameter */ +/* to the column number, where the first +/-Inf is found plus N, */ +/* and continue the computation. */ + + *info = *n + kp1; + + } + +/* ================================================================== */ + +/* Quick return if possible for the case when the first */ +/* stopping criterion is satisfied, i.e. KMAX = 0. */ + + if (*kmax == 0) { + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.; + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + tau[j] = 0.; + } + work[1] = (doublereal) lwkopt; + return 0; + } + +/* ================================================================== */ + + eps = dlamch_("Epsilon"); + +/* Adjust ABSTOL */ + + if (*abstol >= 0.) { + safmin = dlamch_("Safe minimum"); +/* Computing MAX */ + d__1 = *abstol, d__2 = safmin * 2.; + *abstol = f2cmax(d__1,d__2); + } + +/* Adjust RELTOL */ + + if (*reltol >= 0.) { + *reltol = f2cmax(*reltol,eps); + } + +/* =================================================================== */ + +/* JMAX is the maximum index of the column to be factorized, */ +/* which is also limited by the first stopping criterion KMAX. */ + + jmax = f2cmin(*kmax,minmn); + +/* =================================================================== */ + +/* Quick return if possible for the case when the second or third */ +/* stopping criterion for the whole original matrix is satified, */ +/* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */ +/* (which is ONE <= RELTOL). */ + + if (maxc2nrm <= *abstol || 1. <= *reltol) { + + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.; + + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + tau[j] = 0.; + } + + work[1] = (doublereal) lwkopt; + return 0; + } + +/* ================================================================== */ +/* Factorize columns */ +/* ================================================================== */ + +/* Determine the block size. */ + + nbmin = 2; + nx = 0; + + if (nb > 1 && nb < minmn) { + +/* Determine when to cross over from blocked to unblocked code. */ +/* (for N less than NX, unblocked code should be used). 
*/ + +/* Computing MAX */ + i__1 = 0, i__2 = ilaenv_(&c__3, "DGEQP3RK", " ", m, n, &c_n1, &c_n1, ( + ftnlen)8, (ftnlen)1); + nx = f2cmax(i__1,i__2); + + if (nx < minmn) { + +/* Determine if workspace is large enough for blocked code. */ + + if (*lwork < lwkopt) { + +/* Not enough workspace to use optimal block size that */ +/* is currently stored in NB. */ +/* Reduce NB and determine the minimum value of NB. */ + + nb = (*lwork - (*n << 1)) / (*n + 1); +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "DGEQP3RK", " ", m, n, &c_n1, + &c_n1, (ftnlen)8, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + } + } + } + +/* ================================================================== */ + +/* DONE is the boolean flag to represent the case when the */ +/* factorization completed in the block factorization routine, */ +/* before the end of the block. */ + + done = FALSE_; + +/* J is the column index. */ + + j = 1; + +/* (1) Use blocked code initially. */ + +/* JMAXB is the maximum column index of the block, when the */ +/* blocked code is used; it is also limited by the first stopping */ +/* criterion KMAX. */ + +/* Computing MIN */ + i__1 = *kmax, i__2 = minmn - nx; + jmaxb = f2cmin(i__1,i__2); + + if (nb >= nbmin && nb < jmax && jmaxb > 0) { + +/* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */ +/* J is the column index of a column block; */ +/* JB is the column block size to pass to block factorization */ +/* routine in a loop step; */ +/* JBF is the number of columns that were actually factorized, */ +/* as returned by the block factorization routine */ +/* in a loop step, JBF <= JB; */ +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + while(j <= jmaxb) { + +/* Computing MIN */ + i__1 = nb, i__2 = jmaxb - j + 1; + jb = f2cmin(i__1,i__2); + n_sub__ = *n - j + 1; + ioffset = j - 1; + +/* Factorize JB columns among the columns A(J:N). */ + + i__1 = *n + *nrhs - j + 1; + dlaqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, + &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, + maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], & + work[*n + j], &work[(*n << 1) + 1], &work[(*n << 1) + jb + + 1], &i__1, &iwork[1], &iinfo); + +/* Set INFO on the first occurrence of Inf. */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } + + if (done) { + +/* Either the submatrix is zero before the end of the */ +/* column block, or ABSTOL or RELTOL criterion is */ +/* satisfied before the end of the column block, so we can */ +/* return from the routine. Perform the following before */ +/* returning: */ +/* a) Set the number of factorized columns K, */ +/* K = IOFFSET + JBF from the last call of blocked */ +/* routine. */ +/* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */ +/* by the block factorization routine; */ +/* 2) The remaining TAUs are set to ZERO by the */ +/* block factorization routine. */ + + *k = ioffset + jbf; + +/* Set INFO on the first occurrence of NaN, NaN takes */ +/* precedence over Inf. */ + + if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + +/* Return from the routine. */ + + work[1] = (doublereal) lwkopt; + + return 0; + + } + + j += jbf; + + } + + } + +/* Use unblocked code to factor the last or only block. */ +/* J = JMAX+1 means we factorized the maximum possible number of */ +/* columns, that is, in the ELSE clause we need to compute */ +/* the MAXC2NORM and RELMAXC2NORM to return after we processed */ +/* the blocks. */ + + if (j <= jmax) { + +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + n_sub__ = *n - j + 1; + ioffset = j - 1; + + i__1 = jmax - j + 1; + dlaqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, & + maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, + relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], &work[*n + j], & + work[(*n << 1) + 1], &iinfo); + +/* ABSTOL or RELTOL criterion is satisfied when the number of */ +/* the factorized columns KF is smaller than the number */ +/* of columns JMAX-J+1 supplied to be factorized by the */ +/* unblocked routine; in this case we can return from */ +/* the routine. Perform the following before returning: */ +/* a) Set the number of factorized columns K, */ +/* b) MAXC2NRMK and RELMAXC2NRMK are returned by the */ +/* unblocked factorization routine above. */ + + *k = j - 1 + kf; + +/* Set INFO on the first exception occurrence. */ + +/* Set INFO on the first exception occurrence of Inf or NaN, */ +/* (NaN takes precedence over Inf). */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } else if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + + } else { + +/* Compute the return values for blocked code. */ + +/* Set the number of factorized columns if the unblocked routine */ +/* was not called. */ + + *k = jmax; + +/* If there exists a residual matrix after the blocked code: */ +/* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */ +/* residual matrix, otherwise set them to ZERO; */ +/* 2) Set TAU(K+1:MINMN) to ZERO. */ + + if (*k < minmn) { + i__1 = *n - *k; + jmaxc2nrm = *k + idamax_(&i__1, &work[*k + 1], &c__1); + *maxc2nrmk = work[jmaxc2nrm]; + if (*k == 0) { + *relmaxc2nrmk = 1.; + } else { + *relmaxc2nrmk = *maxc2nrmk / maxc2nrm; + } + + i__1 = minmn; + for (j = *k + 1; j <= i__1; ++j) { + tau[j] = 0.; + } + + } + +/* END IF( J.LE.JMAX ) THEN */ + + } + + work[1] = (doublereal) lwkopt; + + return 0; + +/* End of DGEQP3RK */ + +} /* dgeqp3rk_ */ + diff --git a/lapack-netlib/SRC/dgeqp3rk.f b/lapack-netlib/SRC/dgeqp3rk.f new file mode 100644 index 000000000..b8e41b39c --- /dev/null +++ b/lapack-netlib/SRC/dgeqp3rk.f @@ -0,0 +1,1082 @@ +*> \brief \b DGEQP3RK computes a truncated Householder QR factorization with column pivoting of a real m-by-n matrix A by using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGEQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, +* $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ WORK, LWORK, IWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, K, KMAX, LDA, LWORK, M, N, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGEQP3RK performs two tasks simultaneously: +*> +*> Task 1: The routine computes a truncated (rank K) or full rank +*> Householder QR factorization with column pivoting of a real +*> M-by-N matrix A using Level 3 BLAS. K is the number of columns +*> that were factorized, i.e. factorization rank of the +*> factor R, K <= min(M,N).
+*> +*> A * P(K) = Q(K) * R(K) = +*> +*> = Q(K) * ( R11(K) R12(K) ) = Q(K) * ( R(K)_approx ) +*> ( 0 R22(K) ) ( 0 R(K)_residual ), +*> +*> where: +*> +*> P(K) is an N-by-N permutation matrix; +*> Q(K) is an M-by-M orthogonal matrix; +*> R(K)_approx = ( R11(K), R12(K) ) is a rank K approximation of the +*> full rank factor R with K-by-K upper-triangular +*> R11(K) and K-by-N rectangular R12(K). The diagonal +*> entries of R11(K) appear in non-increasing order +*> of absolute value, and absolute values of all of +*> them exceed the maximum column 2-norm of R22(K) +*> up to roundoff error. +*> R(K)_residual = R22(K) is the residual of a rank K approximation +*> of the full rank factor R. It is a +*> an (M-K)-by-(N-K) rectangular matrix; +*> 0 is a an (M-K)-by-K zero matrix. +*> +*> Task 2: At the same time, the routine overwrites a real M-by-NRHS +*> matrix B with Q(K)**T * B using Level 3 BLAS. +*> +*> ===================================================================== +*> +*> The matrices A and B are stored on input in the array A as +*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS) +*> respectively. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> The truncation criteria (i.e. when to stop the factorization) +*> can be any of the following: +*> +*> 1) The input parameter KMAX, the maximum number of columns +*> KMAX to factorize, i.e. the factorization rank is limited +*> to KMAX. If KMAX >= min(M,N), the criterion is not used. +*> +*> 2) The input parameter ABSTOL, the absolute tolerance for +*> the maximum column 2-norm of the residual matrix R22(K). This +*> means that the factorization stops if this norm is less or +*> equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used. +*> +*> 3) The input parameter RELTOL, the tolerance for the maximum +*> column 2-norm matrix of the residual matrix R22(K) divided +*> by the maximum column 2-norm of the original matrix A, which +*> is equal to abs(R(1,1)). This means that the factorization stops +*> when the ratio of the maximum column 2-norm of R22(K) to +*> the maximum column 2-norm of A is less than or equal to RELTOL. +*> If RELTOL < 0.0, the criterion is not used. +*> +*> 4) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix R22(K) is a zero matrix in some +*> factorization step K. ( This stopping criterion is implicit. ) +*> +*> The algorithm stops when any of these conditions is first +*> satisfied, otherwise the whole matrix A is factorized. +*> +*> To factorize the whole matrix A, use the values +*> KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0. +*> +*> The routine returns: +*> a) Q(K), R(K)_approx = ( R11(K), R12(K) ), +*> R(K)_residual = R22(K), P(K), i.e. the resulting matrices +*> of the factorization; P(K) is represented by JPIV, +*> ( if K = min(M,N), R(K)_approx is the full factor R, +*> and there is no residual matrix R(K)_residual); +*> b) K, the number of columns that were factorized, +*> i.e. factorization rank; +*> c) MAXC2NRMK, the maximum column 2-norm of the residual +*> matrix R(K)_residual = R22(K), +*> ( if K = min(M,N), MAXC2NRMK = 0.0 ); +*> d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum +*> column 2-norm of the original matrix A, which is equal +*> to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 ); +*> e) Q(K)**T * B, the matrix B with the orthogonal +*> transformation Q(K)**T applied on the left. +*> +*> The N-by-N permutation matrix P(K) is stored in a compact form in +*> the integer array JPIV. 
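+*>
+*> ( A usage sketch, for illustration only; variable declarations
+*> and a sufficiently large workspace are assumed. To request the
+*> full factorization described above, disable all three truncation
+*> criteria:
+*>
+*>    KMAX   = MIN( M, N )
+*>    ABSTOL = -1.0D+0
+*>    RELTOL = -1.0D+0
+*>    CALL DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+*>   $               K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*>   $               WORK, LWORK, IWORK, INFO )
+*>
+*> On return, provided no NaN was encountered in A, K = MIN(M,N)
+*> and MAXC2NRMK = RELMAXC2NRMK = 0.0, as stated below. )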
For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The M-by-M orthogonal matrix Q is represented as a product +*> of elementary Householder reflectors +*> +*> Q(K) = H(1) * H(2) * . . . * H(K), +*> +*> where K is the number of columns that were factorized. +*> +*> Each H(j) has the form +*> +*> H(j) = I - tau * v * v**T, +*> +*> where 1 <= j <= K and +*> I is an M-by-M identity matrix, +*> tau is a real scalar, +*> v is a real vector with v(1:j-1) = 0 and v(j) = 1. +*> +*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j). +*> +*> See the Further Details section for more information. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e. the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M,N), then this stopping criterion +*> is not used, the routine factorizes columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B are not modified, and +*> the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION +*> +*> The second factorization stopping criterion, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix R22(K). +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix R22(K) +*> is less than or equal to ABSTOL. Let SAFMIN = DLAMCH('S'). +*> +*> a) If ABSTOL is NaN, then no computation is performed +*> and an error message ( INFO = -5 ) is issued +*> by XERBLA. +*> +*> b) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN +*> is used. This includes the case ABSTOL = -0.0. +*> +*> d) If 2*SAFMIN <= ABSTOL then the input value +*> of ABSTOL is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If ABSTOL chosen above is >= MAXC2NRM, then this +*> stopping criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. The routine +*> returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case ABSTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION +*> +*> The third factorization stopping criterion, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio +*> abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of +*> the residual matrix R22(K) to the maximum column 2-norm of +*> the original matrix A. 
The algorithm converges (stops the +*> factorization), when abs(R(K+1,K+1))/abs(R(1,1)) is less +*> than or equal to RELTOL. Let EPS = DLAMCH('E'). +*> +*> a) If RELTOL is NaN, then no computation is performed +*> and an error message ( INFO = -6 ) is issued +*> by XERBLA. +*> +*> b) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used. +*> This includes the case RELTOL = -0.0. +*> +*> d) If EPS <= RELTOL then the input value of RELTOL +*> is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If RELTOL chosen above is >= 1.0, then this stopping +*> criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. +*> The routine returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case RELTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> +*> NOTE: We recommend that RELTOL satisfy +*> min( max(M,N)*EPS, sqrt(EPS) ) <= RELTOL +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N+NRHS) +*> +*> On entry: +*> +*> a) The subarray A(1:M,1:N) contains the M-by-N matrix A. +*> b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS +*> matrix B. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> +*> a) The subarray A(1:M,1:N) contains parts of the factors +*> of the matrix A: +*> +*> 1) If K = 0, A(1:M,1:N) contains the original matrix A. +*> 2) If K > 0, A(1:M,1:N) contains parts of the +*> factors: +*> +*> 1. The elements below the diagonal of the subarray +*> A(1:M,1:K) together with TAU(1:K) represent the +*> orthogonal matrix Q(K) as a product of K Householder +*> elementary reflectors. +*> +*> 2. The elements on and above the diagonal of +*> the subarray A(1:K,1:N) contain K-by-N +*> upper-trapezoidal matrix +*> R(K)_approx = ( R11(K), R12(K) ). +*> NOTE: If K=min(M,N), i.e. full rank factorization, +*> then R_approx(K) is the full factor R which +*> is upper-trapezoidal. If, in addition, M>=N, +*> then R is upper-triangular. +*> +*> 3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K) +*> rectangular matrix R(K)_residual = R22(K). +*> +*> b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains +*> the M-by-NRHS product Q(K)**T * B. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> This is the leading dimension for both matrices, A and B. +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> +*> NOTE: If K = 0, a) the arrays A and B are not modified; +*> b) the array TAU(1:min(M,N)) is set to ZERO, +*> if the matrix A does not contain NaN, +*> otherwise the elements TAU(1:min(M,N)) +*> are undefined; +*> c) the elements of the array JPIV are set +*> as follows: for j = 1:N, JPIV(j) = j. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix R22(K), +*> when the factorization stopped at rank K.
MAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then MAXC2NRMK equals the maximum column 2-norm +*> of the original matrix A. +*> +*> b) If 0 < K < min(M,N), then MAXC2NRMK is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then MAXC2NRMK = 0.0. +*> +*> NOTE: MAXC2NRMK in the factorization step K would equal +*> R(K+1,K+1) in the next factorization step K+1. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix R22(K) (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then RELMAXC2NRMK = 1.0. +*> +*> b) If 0 < K < min(M,N), then +*> RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then RELMAXC2NRMK = 0.0. +*> +*> NOTE: RELMAXC2NRMK in the factorization step K would equal +*> abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization +*> step K+1. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The elements of the array JPIV(1:N) are always set +*> by the routine, for example, even when no columns +*> were factorized, i.e. when K = 0, the elements are +*> set as JPIV(j) = j for j = 1:N. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is DOUBLE PRECISION array, dimension (min(M,N)) +*> The scalar factors of the elementary reflectors. +*> +*> If 0 < K <= min(M,N), only the elements TAU(1:K) of +*> the array TAU are modified by the factorization. +*> After the factorization is computed, if no NaN was found +*> during the factorization, the remaining elements +*> TAU(K+1:min(M,N)) are set to zero, otherwise the +*> elements TAU(K+1:min(M,N)) are not set and therefore +*> undefined. +*> ( If K = 0, all elements of TAU are set to zero, if +*> the matrix A does not contain NaN. ) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= (3*N+NRHS-1), otherwise. +*> For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )), +*> where NB is the optimal block size for DGEQP3RK returned +*> by ILAENV. Minimal block size MINNB=2. +*> +*> NOTE: The decision whether to use unblocked BLAS 2 +*> or blocked BLAS 3 code is based not only on the dimension +*> LWORK of the available workspace WORK, but also on the +*> matrix A dimension N via crossover point NX returned +*> by ILAENV. (For N less than NX, unblocked code should be +*> used.) +*> +*> If LWORK = -1, then a workspace query is assumed; +*> the routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA.
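+*>
+*> ( A minimal workspace-query sketch, for illustration only;
+*> declarations are assumed, with WQUERY a DOUBLE PRECISION
+*> array of length 1 and WORK allocatable:
+*>
+*>    CALL DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+*>   $               K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*>   $               WQUERY, -1, IWORK, INFO )
+*>    LWORK = INT( WQUERY( 1 ) )
+*>    ALLOCATE( WORK( LWORK ) )
+*>
+*> after which the factorization call is repeated with the
+*> allocated WORK and the returned LWORK. )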
+*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix in the blocked step auxiliary subroutine DLAQP3RK ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) INFO < 0: if INFO = -i, the i-th argument had an +*> illegal value. +*> 3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup geqp3rk +* +*> \par Further Details: +* ===================== +* +*> \verbatim +*> DGEQP3RK is based on the same BLAS3 Householder QR factorization +*> algorithm with column pivoting as in DGEQP3 routine which uses +*> DLARFG routine to generate Householder reflectors +*> for QR factorization. +*> +*> We can also write: +*> +*> A = A_approx(K) + A_residual(K) +*> +*> The low rank approximation matrix A(K)_approx from +*> the truncated QR factorization of rank K of the matrix A is: +*> +*> A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T +*> ( 0 0 ) +*> +*> = Q(K) * ( R11(K) R12(K) ) * P(K)**T +*> ( 0 0 ) +*> +*> The residual A_residual(K) of the matrix A is: +*> +*> A_residual(K) = Q(K) * ( 0 0 ) * P(K)**T = +*> ( 0 R(K)_residual ) +*> +*> = Q(K) * ( 0 0 ) * P(K)**T +*> ( 0 R22(K) ) +*> +*> The truncated (rank K) factorization guarantees that +*> the maximum column 2-norm of A_residual(K) is less than +*> or equal to MAXC2NRMK up to roundoff error. +*> +*> NOTE: An approximation of the null vectors +*> of A can be easily computed from R11(K) +*> and R12(K): +*> +*> Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) ) +*> ( -I ) +*> +*> \endverbatim +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. 
+*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, + $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ WORK, LWORK, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER INB, INBMIN, IXOVER + PARAMETER ( INB = 1, INBMIN = 2, IXOVER = 3 ) + DOUBLE PRECISION ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, DONE + INTEGER IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX, + $ JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB, + $ NBMIN, NX + DOUBLE PRECISION EPS, HUGEVAL, MAXC2NRM, SAFMIN +* .. +* .. External Subroutines .. + EXTERNAL DLAQP2RK, DLAQP3RK, XERBLA +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX, ILAENV + DOUBLE PRECISION DLAMCH, DNRM2 + EXTERNAL DISNAN, DLAMCH, DNRM2, IDAMAX, ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test input arguments +* ==================== +* + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( KMAX.LT.0 ) THEN + INFO = -4 + ELSE IF( DISNAN( ABSTOL ) ) THEN + INFO = -5 + ELSE IF( DISNAN( RELTOL ) ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -8 + END IF +* +* If the input parameters M, N, NRHS, KMAX, LDA are valid: +* a) Test the input workspace size LWORK for the minimum +* size requirement IWS. +* b) Determine the optimal block size NB and optimal +* workspace size LWKOPT to be returned in WORK(1) +* in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., +* (3) when routine exits. +* Here, IWS is the miminum workspace required for unblocked +* code. +* + IF( INFO.EQ.0 ) THEN + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + IWS = 1 + LWKOPT = 1 + ELSE +* +* Minimal workspace size in case of using only unblocked +* BLAS 2 code in DLAQP2RK. +* 1) DGEQP3RK and DLAQP2RK: 2*N to store full and partial +* column 2-norms. +* 2) DLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in DLARF subroutine inside DLAQP2RK to apply an +* elementary reflector from the left. +* TOTAL_WORK_SIZE = 3*N + NRHS - 1 +* + IWS = 3*N + NRHS - 1 +* +* Assign to NB optimal block size. 
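+*
+*        ( Illustration, assuming M = N = 1000, NRHS = 1 and a
+*        typical block size NB = 32: IWS above evaluates to
+*        3*1000 + 1 - 1 = 3000, while LWKOPT below evaluates to
+*        2*1000 + 32*( 1000+1+1 ) = 34064 workspace elements. )
+*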
+* + NB = ILAENV( INB, 'DGEQP3RK', ' ', M, N, -1, -1 ) +* +* A formula for the optimal workspace size in case of using +* both unblocked BLAS 2 in DLAQP2RK and blocked BLAS 3 code +* in DLAQP3RK. +* 1) DGEQP3RK, DLAQP2RK, DLAQP3RK: 2*N to store full and +* partial column 2-norms. +* 2) DLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in DLARF subroutine to apply an elementary reflector +* from the left. +* 3) DLAQP3RK: NB*(N+NRHS) to use in the work array F that +* is used to apply a block reflector from +* the left. +* 4) DLAQP3RK: NB to use in the auxilixary array AUX. +* Sizes (2) and ((3) + (4)) should intersect, therefore +* TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. +* + LWKOPT = 2*N + NB*( N+NRHS+1 ) + END IF + WORK( 1 ) = DBLE( LWKOPT ) +* + IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN + INFO = -15 + END IF + END IF +* +* NOTE: The optimal workspace size is returned in WORK(1), if +* the input parameters M, N, NRHS, KMAX, LDA are valid. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGEQP3RK', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible for M=0 or N=0. +* + IF( MINMN.EQ.0 ) THEN + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + WORK( 1 ) = DBLE( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* +* Initialize column pivot array JPIV. +* + DO J = 1, N + JPIV( J ) = J + END DO +* +* ================================================================== +* +* Initialize storage for partial and exact column 2-norms. +* a) The elements WORK(1:N) are used to store partial column +* 2-norms of the matrix A, and may decrease in each computation +* step; initialize to the values of complete columns 2-norms. +* b) The elements WORK(N+1:2*N) are used to store complete column +* 2-norms of the matrix A, they are not changed during the +* computation; initialize the values of complete columns 2-norms. +* + DO J = 1, N + WORK( J ) = DNRM2( M, A( 1, J ), 1 ) + WORK( N+J ) = WORK( J ) + END DO +* +* ================================================================== +* +* Compute the pivot column index and the maximum column 2-norm +* for the whole original matrix stored in A(1:M,1:N). +* + KP1 = IDAMAX( N, WORK( 1 ), 1 ) + MAXC2NRM = WORK( KP1 ) +* +* ==================================================================. +* + IF( DISNAN( MAXC2NRM ) ) THEN +* +* Check if the matrix A contains NaN, set INFO parameter +* to the column number where the first NaN is found and return +* from the routine. +* + K = 0 + INFO = KP1 +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = MAXC2NRM +* +* Array TAU is not set and contains undefined elements. +* + WORK( 1 ) = DBLE( LWKOPT ) + RETURN + END IF +* +* =================================================================== +* + IF( MAXC2NRM.EQ.ZERO ) THEN +* +* Check is the matrix A is a zero matrix, set array TAU and +* return from the routine. +* + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + DO J = 1, MINMN + TAU( J ) = ZERO + END DO +* + WORK( 1 ) = DBLE( LWKOPT ) + RETURN +* + END IF +* +* =================================================================== +* + HUGEVAL = DLAMCH( 'Overflow' ) +* + IF( MAXC2NRM.GT.HUGEVAL ) THEN +* +* Check if the matrix A contains +Inf or -Inf, set INFO parameter +* to the column number, where the first +/-Inf is found plus N, +* and continue the computation. 
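+*
+*        ( Illustration, assuming N = 10 and the first infinite
+*        entry in column KP1 = 3: INFO is set to N + KP1 = 13
+*        here and the factorization continues. )
+*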
+* + INFO = N + KP1 +* + END IF +* +* ================================================================== +* +* Quick return if possible for the case when the first +* stopping criterion is satisfied, i.e. KMAX = 0. +* + IF( KMAX.EQ.0 ) THEN + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE + DO J = 1, MINMN + TAU( J ) = ZERO + END DO + WORK( 1 ) = DBLE( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* + EPS = DLAMCH('Epsilon') +* +* Adjust ABSTOL +* + IF( ABSTOL.GE.ZERO ) THEN + SAFMIN = DLAMCH('Safe minimum') + ABSTOL = MAX( ABSTOL, TWO*SAFMIN ) + END IF +* +* Adjust RELTOL +* + IF( RELTOL.GE.ZERO ) THEN + RELTOL = MAX( RELTOL, EPS ) + END IF +* +* =================================================================== +* +* JMAX is the maximum index of the column to be factorized, +* which is also limited by the first stopping criterion KMAX. +* + JMAX = MIN( KMAX, MINMN ) +* +* =================================================================== +* +* Quick return if possible for the case when the second or third +* stopping criterion for the whole original matrix is satisfied, +* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL +* (which is ONE <= RELTOL). +* + IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN +* + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE +* + DO J = 1, MINMN + TAU( J ) = ZERO + END DO +* + WORK( 1 ) = DBLE( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* Factorize columns +* ================================================================== +* +* Determine the block size. +* + NBMIN = 2 + NX = 0 +* + IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN +* +* Determine when to cross over from blocked to unblocked code. +* (for N less than NX, unblocked code should be used). +* + NX = MAX( 0, ILAENV( IXOVER, 'DGEQP3RK', ' ', M, N, -1, -1 )) +* + IF( NX.LT.MINMN ) THEN +* +* Determine if workspace is large enough for blocked code. +* + IF( LWORK.LT.LWKOPT ) THEN +* +* Not enough workspace to use optimal block size that +* is currently stored in NB. +* Reduce NB and determine the minimum value of NB. +* + NB = ( LWORK-2*N ) / ( N+1 ) + NBMIN = MAX( 2, ILAENV( INBMIN, 'DGEQP3RK', ' ', M, N, + $ -1, -1 ) ) +* + END IF + END IF + END IF +* +* ================================================================== +* +* DONE is the boolean flag to represent the case when the +* factorization completed in the block factorization routine, +* before the end of the block. +* + DONE = .FALSE. +* +* J is the column index. +* + J = 1 +* +* (1) Use blocked code initially. +* +* JMAXB is the maximum column index of the block, when the +* blocked code is used; it is also limited by the first stopping +* criterion KMAX. +* + JMAXB = MIN( KMAX, MINMN - NX ) +* + IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN +* +* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: +* J is the column index of a column block; +* JB is the column block size to pass to block factorization +* routine in a loop step; +* JBF is the number of columns that were actually factorized, +* as returned by the block factorization routine +* in a loop step, JBF <= JB; +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + DO WHILE( J.LE.JMAXB ) +* + JB = MIN( NB, JMAXB-J+1 ) + N_SUB = N-J+1 + IOFFSET = J-1 +* +* Factorize JB columns among the columns A(J:N). +* + CALL DLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ DONE, JBF, MAXC2NRMK, RELMAXC2NRMK, + $ JPIV( J ), TAU( J ), + $ WORK( J ), WORK( N+J ), + $ WORK( 2*N+1 ), WORK( 2*N+JB+1 ), + $ N+NRHS-J+1, IWORK, IINFO ) +* +* Set INFO on the first occurrence of Inf. +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + END IF +* + IF( DONE ) THEN +* +* Either the submatrix is zero before the end of the +* column block, or ABSTOL or RELTOL criterion is +* satisfied before the end of the column block, so we can +* return from the routine. Perform the following before +* returning: +* a) Set the number of factorized columns K, +* K = IOFFSET + JBF from the last call of blocked +* routine. +* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned +* by the block factorization routine; +* 2) The remaining TAUs are set to ZERO by the +* block factorization routine. +* + K = IOFFSET + JBF +* +* Set INFO on the first occurrence of NaN, NaN takes +* precedence over Inf. +* + IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* +* Return from the routine. +* + WORK( 1 ) = DBLE( LWKOPT ) +* + RETURN +* + END IF +* + J = J + JBF +* + END DO +* + END IF +* +* Use unblocked code to factor the last or only block. +* J = JMAX+1 means we factorized the maximum possible number of +* columns, that is, in the ELSE clause we need to compute +* the MAXC2NORM and RELMAXC2NORM to return after we processed +* the blocks. +* + IF( J.LE.JMAX ) THEN +* +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + N_SUB = N-J+1 + IOFFSET = J-1 +* + CALL DLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1, + $ ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ), + $ TAU( J ), WORK( J ), WORK( N+J ), + $ WORK( 2*N+1 ), IINFO ) +* +* ABSTOL or RELTOL criterion is satisfied when the number of +* the factorized columns KF is smaller than the number +* of columns JMAX-J+1 supplied to be factorized by the +* unblocked routine; in this case we can return from +* the routine. Perform the following before returning: +* a) Set the number of factorized columns K, +* b) MAXC2NRMK and RELMAXC2NRMK are returned by the +* unblocked factorization routine above. +* + K = J - 1 + KF +* +* Set INFO on the first exception occurrence. +* +* Set INFO on the first exception occurrence of Inf or NaN, +* (NaN takes precedence over Inf). +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* + ELSE +* +* Compute the return values for blocked code. +* +* Set the number of factorized columns if the unblocked routine +* was not called. +* + K = JMAX +* +* If there exists a residual matrix after the blocked code: +* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the +* residual matrix, otherwise set them to ZERO; +* 2) Set TAU(K+1:MINMN) to ZERO.
+* + IF( K.LT.MINMN ) THEN + JMAXC2NRM = K + IDAMAX( N-K, WORK( K+1 ), 1 ) + MAXC2NRMK = WORK( JMAXC2NRM ) + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + DO J = K + 1, MINMN + TAU( J ) = ZERO + END DO +* + END IF +* +* END IF( J.LE.JMAX ) THEN +* + END IF +* + WORK( 1 ) = DBLE( LWKOPT ) +* + RETURN +* +* End of DGEQP3RK +* + END diff --git a/lapack-netlib/SRC/dgeqr.f b/lapack-netlib/SRC/dgeqr.f index eac8930ce..6ed8f211f 100644 --- a/lapack-netlib/SRC/dgeqr.f +++ b/lapack-netlib/SRC/dgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE DGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -244,8 +246,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +257,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +272,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +286,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +313,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ * RETURN * diff --git a/lapack-netlib/SRC/dgeqrfp.f b/lapack-netlib/SRC/dgeqrfp.f index 46d2ee479..aa757e96c 100644 --- a/lapack-netlib/SRC/dgeqrfp.f +++ b/lapack-netlib/SRC/dgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -122,7 +123,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup geqrfp * *> \par Further Details: * ===================== @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. 
EXTERNAL DGEQR2P, DLARFB, DLARFT, XERBLA @@ -181,8 +182,16 @@ * INFO = 0 NB = ILAENV( 1, 'DGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF WORK( 1 ) = LWKOPT +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -190,7 +199,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -202,7 +211,6 @@ * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -210,7 +218,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. diff --git a/lapack-netlib/SRC/dgerqf.f b/lapack-netlib/SRC/dgerqf.f index cca9d6367..435239cc7 100644 --- a/lapack-netlib/SRC/dgerqf.f +++ b/lapack-netlib/SRC/dgerqf.f @@ -114,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gerqf * *> \par Further Details: * ===================== @@ -189,7 +189,7 @@ END IF WORK( 1 ) = LWKOPT * - IF ( .NOT.LQUERY ) THEN + IF( .NOT.LQUERY ) THEN IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) $ INFO = -7 END IF diff --git a/lapack-netlib/SRC/dgesvj.f b/lapack-netlib/SRC/dgesvj.f index 5fdb21e45..198bfb0a5 100644 --- a/lapack-netlib/SRC/dgesvj.f +++ b/lapack-netlib/SRC/dgesvj.f @@ -208,7 +208,7 @@ *> *> \param[in,out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On entry : *> If JOBU = 'C' : *> WORK(1) = CTOL, where CTOL defines the threshold for convergence. @@ -239,7 +239,12 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> length of WORK, WORK >= MAX(6,M+N) +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(6,M+N), otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; WORK(1) is set to the minial (and optimal) +*> length of WORK. *> \endverbatim *> *> \param[out] INFO @@ -260,7 +265,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -365,9 +370,9 @@ INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, - $ SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + $ SWBAND, MINMN, LWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, + $ ROTOK, RSVEC, UCTOL, UPPER * .. * .. Local Arrays .. DOUBLE PRECISION FASTR( 5 ) @@ -408,6 +413,14 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 6, M+N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -427,7 +440,7 @@ INFO = -11 ELSE IF( UCTOL .AND. ( WORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( LWORK.LT.MAX( M+N, 6 ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
( .NOT.LQUERY ) ) THEN INFO = -13 ELSE INFO = 0 @@ -437,11 +450,14 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGESVJ', -INFO ) RETURN + ELSE IF( LQUERY ) THEN + WORK( 1 ) = LWMIN + RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/dgetri.f b/lapack-netlib/SRC/dgetri.f index 92ef90c18..7b5a3a1b6 100644 --- a/lapack-netlib/SRC/dgetri.f +++ b/lapack-netlib/SRC/dgetri.f @@ -107,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup getri * * ===================================================================== SUBROUTINE DGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -151,8 +151,9 @@ * INFO = 0 NB = ILAENV( 1, 'DGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT +* LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/dgetsls.f b/lapack-netlib/SRC/dgetsls.f index 25f4c12c2..73b505ff7 100644 --- a/lapack-netlib/SRC/dgetsls.f +++ b/lapack-netlib/SRC/dgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsolve +*> \ingroup getsls * * ===================================================================== SUBROUTINE DGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, @@ -189,7 +189,7 @@ * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH, DLANGE - EXTERNAL LSAME, DLABAD, DLAMCH, DLANGE + EXTERNAL LSAME, DLAMCH, DLANGE * .. * .. External Subroutines .. EXTERNAL DGEQR, DGEMQR, DLASCL, DLASET, @@ -226,7 +226,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEM = 1 + WSIZEO = 1 + ELSE IF( M.GE.N ) THEN CALL DGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) @@ -294,7 +297,6 @@ * SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * diff --git a/lapack-netlib/SRC/dgetsqrhrt.f b/lapack-netlib/SRC/dgetsqrhrt.f index 668deeba8..682c7c30f 100644 --- a/lapack-netlib/SRC/dgetsqrhrt.f +++ b/lapack-netlib/SRC/dgetsqrhrt.f @@ -130,14 +130,17 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -160,7 +163,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
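*
*> ( A workspace-query sketch for DGETSQRHRT, for illustration only;
*> the argument order M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
*> LWORK, INFO is inferred from the INFO codes checked above, and
*> declarations are assumed, with WQUERY a DOUBLE PRECISION array
*> of length 1:
*>
*>    CALL DGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT,
*>   $                 WQUERY, -1, INFO )
*>    LWORK = INT( WQUERY( 1 ) )
*>
*> after which WORK can be allocated and the call repeated with the
*> actual LWORK. )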
* -*> \ingroup doubleOTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -212,7 +215,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -225,7 +228,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -263,8 +266,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN INFO = -11 END IF * @@ -346,4 +350,4 @@ * * End of DGETSQRHRT * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/dgges.f b/lapack-netlib/SRC/dgges.f index 31db23715..b9ffc7982 100644 --- a/lapack-netlib/SRC/dgges.f +++ b/lapack-netlib/SRC/dgges.f @@ -234,8 +234,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If N = 0, LWORK >= 1, else LWORK >= 8*N+16. -*> For good performance , LWORK must generally be larger. +*> If N = 0, LWORK >= 1, else LWORK >= MAX(8*N,6*N+16). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -275,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEeigen +*> \ingroup gges * * ===================================================================== SUBROUTINE DGGES( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, LDB, @@ -321,9 +321,8 @@ DOUBLE PRECISION DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHRD, DHGEQZ, DLABAD, - $ DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, - $ XERBLA + EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHRD, DHGEQZ, DLACPY, + $ DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, XERBLA * .. * .. External Functions .. LOGICAL LSAME @@ -431,7 +430,6 @@ EPS = DLAMCH( 'P' ) SAFMIN = DLAMCH( 'S' ) SAFMAX = ONE / SAFMIN - CALL DLABAD( SAFMIN, SAFMAX ) SMLNUM = SQRT( SAFMIN ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/dgges3.f b/lapack-netlib/SRC/dgges3.f index 7b00d294a..2ef55951a 100644 --- a/lapack-netlib/SRC/dgges3.f +++ b/lapack-netlib/SRC/dgges3.f @@ -234,6 +234,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK >= 6*N+16. +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -273,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE DGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -309,7 +311,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, LST2SL, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT + $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT, + $ LWKMIN DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SAFMAX, SAFMIN, SMLNUM * .. @@ -318,9 +321,8 @@ DOUBLE PRECISION DIF( 2 ) * .. * .. External Subroutines .. 
- EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLABAD, - $ DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, - $ XERBLA + EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLACPY, + $ DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, XERBLA * .. * .. External Functions .. LOGICAL LSAME @@ -362,6 +364,12 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + IF( N.EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 6*N+16 + END IF +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -378,7 +386,7 @@ INFO = -15 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -17 - ELSE IF( LWORK.LT.6*N+16 .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -19 END IF * @@ -386,29 +394,33 @@ * IF( INFO.EQ.0 ) THEN CALL DGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 6*N+16, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL DORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) IF( ILVSL ) THEN CALL DORGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) END IF CALL DGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL, $ LDVSL, VSR, LDVSR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL DLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VSL, LDVSL, VSR, LDVSR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) IF( WANTST ) THEN CALL DTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VSL, LDVSL, VSR, LDVSR, $ SDIM, PVSL, PVSR, DIF, WORK, -1, IDUM, 1, $ IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = LWKOPT END IF - WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN @@ -430,7 +442,6 @@ EPS = DLAMCH( 'P' ) SAFMIN = DLAMCH( 'S' ) SAFMAX = ONE / SAFMIN - CALL DLABAD( SAFMIN, SAFMAX ) SMLNUM = SQRT( SAFMIN ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/dggev3.f b/lapack-netlib/SRC/dggev3.f index 4bbe8a40f..b970c04c4 100644 --- a/lapack-netlib/SRC/dggev3.f +++ b/lapack-netlib/SRC/dggev3.f @@ -188,7 +188,9 @@ *> *> \param[in] LWORK *> \verbatim -*> LWORK is INTEGER +*> LWORK is INTEGER. +*> The dimension of the array WORK. LWORK >= MAX(1,8*N). +*> For good performance, LWORK should generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -217,7 +219,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE DGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHAR, @@ -248,7 +250,8 @@ LOGICAL ILASCL, ILBSCL, ILV, ILVL, ILVR, LQUERY CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, - $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT + $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT, + $ LWKMIN DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP * .. @@ -256,9 +259,8 @@ LOGICAL LDUMMA( 1 ) * .. * .. External Subroutines .. 
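The DGGEV3 hunks that follow rework the workspace logic the same way DGGES3 does above: a documented minimum (LWKMIN = MAX(1, 8*N) here, 6*N+16 for DGGES3) is validated against LWORK, and the optimal size is then accumulated by probing each stage (DGEQRF, DORMQR, DORGQR, DGGHD3, DLAQZ0, DTGSEN) with LWORK = -1 and keeping a running maximum. A schematic of that folding step; stage_req[] is a hypothetical array holding the per-stage values of 3*N+INT(WORK(1)) read back from the probes:

    /* Fold per-stage LWORK = -1 probe results into one optimal size. */
    static int fold_lwkopt(int lwkmin, const int *stage_req, int nstages)
    {
        int lwkopt = lwkmin;
        for (int i = 0; i < nstages; i++)
            if (stage_req[i] > lwkopt)
                lwkopt = stage_req[i];   /* LWKOPT = MAX( LWKOPT, ... ) */
        return lwkopt;
    }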
- EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLABAD, - $ DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGEVC, - $ XERBLA + EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLACPY, + $ DLASCL, DLASET, DORGQR, DORMQR, DTGEVC, XERBLA * .. * .. External Functions .. LOGICAL LSAME @@ -299,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 8*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -313,7 +316,7 @@ INFO = -12 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -14 - ELSE IF( LWORK.LT.MAX( 1, 8*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF * @@ -321,13 +324,13 @@ * IF( INFO.EQ.0 ) THEN CALL DGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX(1, 8*N, 3*N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL DORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, -1, $ IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) IF( ILVL ) THEN CALL DORGQR( N, N, N, VL, LDVL, WORK, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) END IF IF( ILV ) THEN CALL DGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL, @@ -336,18 +339,21 @@ CALL DLAQZ0( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) ELSE CALL DGGHD3( 'N', 'N', N, 1, N, A, LDA, B, LDB, VL, LDVL, $ VR, LDVR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL DLAQZ0( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = LWKOPT END IF - - WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN @@ -367,7 +373,6 @@ EPS = DLAMCH( 'P' ) SMLNUM = DLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/dgghd3.f b/lapack-netlib/SRC/dgghd3.f index 43d7a77df..21a668573 100644 --- a/lapack-netlib/SRC/dgghd3.f +++ b/lapack-netlib/SRC/dgghd3.f @@ -179,14 +179,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> -*> \param[in] LWORK +*> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -211,7 +211,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -275,7 +275,12 @@ * INFO = 0 NB = ILAENV( 1, 'DGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF WORK( 1 ) = DBLE( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. 
LSAME( COMPQ, 'V' ) @@ -325,7 +330,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = ONE RETURN @@ -885,6 +889,7 @@ IF ( JCOL.LT.IHI ) $ CALL DGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) +* WORK( 1 ) = DBLE( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/dggqrf.f b/lapack-netlib/SRC/dggqrf.f index 39d27a5c9..edac7f22f 100644 --- a/lapack-netlib/SRC/dggqrf.f +++ b/lapack-netlib/SRC/dggqrf.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup ggqrf * *> \par Further Details: * ===================== @@ -250,7 +250,7 @@ NB2 = ILAENV( 1, 'DGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'DORMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN @@ -287,6 +287,7 @@ * RQ factorization of N-by-P matrix B: B = T*Z. * CALL DGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO ) +* WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) * RETURN diff --git a/lapack-netlib/SRC/dggrqf.f b/lapack-netlib/SRC/dggrqf.f index ddf4104c5..3b1024c1c 100644 --- a/lapack-netlib/SRC/dggrqf.f +++ b/lapack-netlib/SRC/dggrqf.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup ggrqf * *> \par Further Details: * ===================== @@ -249,7 +249,7 @@ NB2 = ILAENV( 1, 'DGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'DORMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN diff --git a/lapack-netlib/SRC/dggsvd3.f b/lapack-netlib/SRC/dggsvd3.f index 503f0d8cc..ee4d11e86 100644 --- a/lapack-netlib/SRC/dggsvd3.f +++ b/lapack-netlib/SRC/dggsvd3.f @@ -278,7 +278,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -328,7 +328,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/dggsvp3.f b/lapack-netlib/SRC/dggsvp3.f index 4e1db3117..485d95b36 100644 --- a/lapack-netlib/SRC/dggsvp3.f +++ b/lapack-netlib/SRC/dggsvp3.f @@ -227,7 +227,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -250,7 +250,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/dlamswlq.f b/lapack-netlib/SRC/dlamswlq.f index 70e78f4b1..07ef1bd57 100644 --- a/lapack-netlib/SRC/dlamswlq.f +++ b/lapack-netlib/SRC/dlamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. 
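The behavioural point of this hunk (and of the matching DLAMTSQR hunk further down) is spelled out just below: a degenerate problem with MIN(M,N,K) = 0 now only requires LWORK >= 1 instead of the side-dependent bound, and LWORK = -1 returns the minimal rather than the optimal size. Mirrored in C as a sketch, where lw is the side-dependent requirement (N*MB on the left, M*MB on the right, as in the Fortran local LW):

    /* Minimal LWORK as DLAMSWLQ now computes it (LWMIN in the hunk). */
    static int lamswlq_lwmin(int m, int n, int k, int lw)
    {
        int minmnk = m < n ? m : n;
        if (k < minmnk) minmnk = k;
        if (minmnk == 0)
            return 1;                /* nothing to factorize or apply */
        return lw > 1 ? lw : 1;      /* MAX( 1, LW )                  */
    }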
-*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -189,29 +192,31 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE DLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + DOUBLE PRECISION A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, CTR, LW + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, CTR, LW, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -223,52 +228,60 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * INFO = 0 IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. 
(.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLAMSWLQ', -INFO ) - WORK(1) = LW RETURN - ELSE IF (LQUERY) THEN - WORK(1) = LW + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -402,7 +415,8 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN +* RETURN * * End of DLAMSWLQ diff --git a/lapack-netlib/SRC/dlamtsqr.f b/lapack-netlib/SRC/dlamtsqr.f index 962a31476..023db5ac9 100644 --- a/lapack-netlib/SRC/dlamtsqr.f +++ b/lapack-netlib/SRC/dlamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) -*> +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -191,29 +193,31 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE DLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + DOUBLE PRECISION A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -225,12 +229,13 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -238,11 +243,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. 
.NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -253,38 +264,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = LW + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL DGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -410,7 +421,8 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN +* RETURN * * End of DLAMTSQR diff --git a/lapack-netlib/SRC/dlaqp2rk.c b/lapack-netlib/SRC/dlaqp2rk.c new file mode 100644 index 000000000..de216ad97 --- /dev/null +++ b/lapack-netlib/SRC/dlaqp2rk.c @@ -0,0 +1,923 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen 
ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + *info = *n + kk - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL >= ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL >= ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + +/* Set K, the number of factorized columns. */ + + *k = kk - 1; + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. */ + + i__2 = minmnfact; + for (j = kk; j <= i__2; ++j) { + tau[j] = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,KK:N): */ +/* 1) swap the KK-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) copy the KK-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than KK in the next loop step.) */ +/* 3) Save the pivot interchange with the indices relative to the */ +/* the original matrix A, not the block A(1:M,1:N). */ + + if (kp != kk) { + dswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1); + vn1[kp] = vn1[kk]; + vn2[kp] = vn2[kk]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[kk]; + jpiv[kk] = itemp; + } + +/* Generate elementary reflector H(KK) using the column A(I:M,KK), */ +/* if the column has more than one element, otherwise */ +/* the elementary reflector would be an identity matrix, */ +/* and TAU(KK) = ZERO. 
*/ + + if (i__ < *m) { + i__2 = *m - i__ + 1; + dlarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], & + c__1, &tau[kk]); + } else { + tau[kk] = 0.; + } + +/* Check if TAU(KK) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(KK) for Inf, */ +/* since DLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by DLARFG can contain Inf, which requires */ +/* TAU(KK) to contain NaN. Therefore, this case of generating Inf */ +/* by DLARFG is covered by checking TAU(KK) for NaN. */ + + if (disnan_(&tau[kk])) { + *k = kk - 1; + *info = kk; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = tau[kk]; + *relmaxc2nrmk = tau[kk]; + +/* Array TAU(KK:MINMNFACT) is not set and contains */ +/* undefined elements, except the first element TAU(KK) = NaN. */ + + return 0; + } + +/* Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left. */ +/* ( If M >= N, then at KK = N there is no residual matrix, */ +/* i.e. no columns of A to update, only columns of B. */ +/* If M < N, then at KK = M-IOFFSET, I = M and we have a */ +/* one-row residual matrix in A and the elementary */ +/* reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update */ +/* is needed for the residual matrix in A and the */ +/* right-hand-side-matrix in B. */ +/* Therefore, we update only if */ +/* KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */ +/* condition is satisfied, not only KK < N+NRHS ) */ + + if (kk < minmnupdt) { + aikk = a[i__ + kk * a_dim1]; + a[i__ + kk * a_dim1] = 1.; + i__2 = *m - i__ + 1; + i__3 = *n + *nrhs - kk; + dlarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &tau[ + kk], &a[i__ + (kk + 1) * a_dim1], lda, &work[1]); + a[i__ + kk * a_dim1] = aikk; + } + + if (kk < minmnfact) { + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */ +/* when KK < f2cmin(M-IOFFSET, N). */ + + i__2 = *n; + for (j = kk + 1; j <= i__2; ++j) { + if (vn1[j] != 0.) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + +/* Computing 2nd power */ + d__2 = (d__1 = a[i__ + j * a_dim1], abs(d__1)) / vn1[j]; + temp = 1. - d__2 * d__2; + temp = f2cmax(temp,0.); +/* Computing 2nd power */ + d__1 = vn1[j] / vn2[j]; + temp2 = temp * (d__1 * d__1); + if (temp2 <= tol3z) { + +/* Compute the column 2-norm for the partial */ +/* column A(I+1:M,J) by explicitly computing it, */ +/* and store it in both partial 2-norm vector VN1 */ +/* and exact column 2-norm vector VN2. */ + + i__3 = *m - i__; + vn1[j] = dnrm2_(&i__3, &a[i__ + 1 + j * a_dim1], & + c__1); + vn2[j] = vn1[j]; + + } else { + +/* Update the column 2-norm for the partial */ +/* column A(I+1:M,J) by removing one */ +/* element A(I,J) and store it in partial */ +/* 2-norm vector VN1. */ + + vn1[j] *= sqrt(temp); + + } + } + } + + } + +/* End factorization loop */ + + } + +/* If we reached this point, all colunms have been factorized, */ +/* i.e. no condition was triggered to exit the routine. */ +/* Set the number of factorized columns. */ + + *k = *kmax; + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */ +/* we return. 
*/ + + if (*k < minmnfact) { + + i__1 = *n - *k; + jmaxc2nrm = *k + idamax_(&i__1, &vn1[*k + 1], &c__1); + *maxc2nrmk = vn1[jmaxc2nrm]; + + if (*k == 0) { + *relmaxc2nrmk = 1.; + } else { + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + } + + } else { + *maxc2nrmk = 0.; + *relmaxc2nrmk = 0.; + } + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, set TAUs corresponding to the columns that were */ +/* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. */ + + i__1 = minmnfact; + for (j = *k + 1; j <= i__1; ++j) { + tau[j] = 0.; + } + + return 0; + +/* End of DLAQP2RK */ + +} /* dlaqp2rk_ */ + diff --git a/lapack-netlib/SRC/dlaqp2rk.f b/lapack-netlib/SRC/dlaqp2rk.f new file mode 100644 index 000000000..aecd6bb69 --- /dev/null +++ b/lapack-netlib/SRC/dlaqp2rk.f @@ -0,0 +1,713 @@ +*> \brief \b DLAQP2RK computes truncated QR factorization with column pivoting of a real matrix block using Level 2 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DLAQP2RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, +* $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, +* $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, +* $ INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER JPIV( * ) +* DOUBLE PRECISION A( LDA, * ), TAU( * ), VN1( * ), VN2( * ), +* $ WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLAQP2RK computes a truncated (rank K) or full rank Householder QR +*> factorization with column pivoting of a real matrix +*> block A(IOFFSET+1:M,1:N) as +*> +*> A * P(K) = Q(K) * R(K). +*> +*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N) +*> is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides matrix block B +*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**T * B. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M-IOFFSET,N), then this stopping +*> criterion is not used, factorize columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. 
+*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The second factorization stopping criterion. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The third factorization stopping criterion. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine DGEQP3RK. 1 <= KP1 <= N_orig_mat. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is DOUBLE PRECISION +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine DGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:K) below +*> the diagonal together with the array TAU represent +*> the orthogonal matrix Q(K) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:K) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS). +*> The left part A(IOFFSET+1:M,K+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(K)**T. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. 
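Taken together, the KMAX, ABSTOL and RELTOL arguments described above form a three-way stopping rule that DLAQP2RK re-evaluates at every factorization step. A compact restatement as a hypothetical C predicate; the Fortran code applies exactly these comparisons to MAXC2NRMK and to its ratio against MAXC2NRM:

    /* Returns nonzero when any of the three stopping criteria fires. */
    static int qp2rk_should_stop(int k, int kmax, double maxc2nrmk,
                                 double maxc2nrm, double abstol,
                                 double reltol)
    {
        if (k >= kmax)                      return 1;  /* rank cap     */
        if (maxc2nrmk <= abstol)            return 1;  /* absolute tol */
        if (maxc2nrmk / maxc2nrm <= reltol) return 1;  /* relative tol */
        return 0;
    }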
+*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is DOUBLE PRECISION array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is DOUBLE PRECISION array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is DOUBLE PRECISION array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (N-1) +*> Used in DLARF subroutine to apply an elementary +*> reflector from the left. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp2rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. 
+*> \htmlonly
+*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> https://doi.org/10.1145/1377612.1377616
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November 2023, Igor Kozachenko, James Demmel,
+*>                 EECS Department,
+*>                 University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE DLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+     $                     INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            JPIV( * )
+      DOUBLE PRECISION   A( LDA, * ), TAU( * ), VN1( * ), VN2( * ),
+     $                   WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT,
+     $                   MINMNUPDT
+      DOUBLE PRECISION   AIKK, HUGEVAL, TEMP, TEMP2, TOL3Z
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLARF, DLARFG, DSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            DISNAN
+      INTEGER            IDAMAX
+      DOUBLE PRECISION   DLAMCH, DNRM2
+      EXTERNAL           DISNAN, DLAMCH, IDAMAX, DNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT is the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+*     MINMNUPDT is the smallest dimension
+*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be updated, which
+*     contains the submatrices A(IOFFSET+1:M,1:N) and
+*     B(IOFFSET+1:M,1:NRHS) as column blocks.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      KMAX = MIN( KMAX, MINMNFACT )
+      TOL3Z = SQRT( DLAMCH( 'Epsilon' ) )
+      HUGEVAL = DLAMCH( 'Overflow' )
+*
+*     Compute the factorization, KK is the column loop index.
+*
+      DO KK = 1, KMAX
+*
+         I = IOFFSET + KK
+*
+         IF( I.EQ.1 ) THEN
+*
+*           ============================================================
+*
+*           We are at the first column of the original whole matrix A,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+
+            KP = KP1
+*
+*           ============================================================
+*
+         ELSE
+*
+*           ============================================================
+*
+*           Determine the pivot column in the KK-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,KK:N).
+*
+            KP = ( KK-1 ) + IDAMAX( N-KK+1, VN1( KK ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,KK:N) in step KK.
+*           RELMAXC2NRMK will be computed later, after some condition
+*           checks on MAXC2NRMK.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains NaN, and set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( DISNAN( MAXC2NRMK ) ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + INFO = K + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* Array TAU(K+1:MINMNFACT) is not set and contains +* undefined elements. +* + RETURN + END IF +* +* ============================================================ +* +* Quick return, if the submatrix A(I:M,KK:N) is +* a zero matrix. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + RELMAXC2NRMK = ZERO +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + KK - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third stopping criteria. +* NOTE: There is no need to test for ABSTOL >= ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL >= ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. + + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* +* Set K, the number of factorized columns. +* + K = KK - 1 +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,KK:N): +* 1) swap the KK-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) copy the KK-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than KK in the next loop step.) +* 3) Save the pivot interchange with the indices relative to the +* the original matrix A, not the block A(1:M,1:N). 
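The VN1/VN2 bookkeeping in the loop below is the LAWN 176 partial-norm downdate cited in the references: after row I is eliminated, each remaining partial column norm is cheaply downdated, and only recomputed from scratch when cancellation makes the downdate untrustworthy. With \( \nu_j \) for VN1(J), \( \hat{\nu}_j \) for VN2(J) and \( \mathrm{tol3z} = \sqrt{\varepsilon} \), the update reads:

    \nu_j \leftarrow \nu_j\,\sqrt{\max\Bigl(0,\;1 - \bigl(|a_{ij}|/\nu_j\bigr)^{2}\Bigr)},
    \qquad
    \text{recompute } \nu_j = \lVert A(i{+}1{:}m,\,j)\rVert_2,\ \hat{\nu}_j = \nu_j
    \quad\text{when}\quad
    \Bigl(1 - \bigl(|a_{ij}|/\nu_j\bigr)^{2}\Bigr)\bigl(\nu_j/\hat{\nu}_j\bigr)^{2} \le \mathrm{tol3z}.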
+*
+         IF( KP.NE.KK ) THEN
+            CALL DSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 )
+            VN1( KP ) = VN1( KK )
+            VN2( KP ) = VN2( KK )
+            ITEMP = JPIV( KP )
+            JPIV( KP ) = JPIV( KK )
+            JPIV( KK ) = ITEMP
+         END IF
+*
+*        Generate elementary reflector H(KK) using the column A(I:M,KK),
+*        if the column has more than one element, otherwise
+*        the elementary reflector would be an identity matrix,
+*        and TAU(KK) = ZERO.
+*
+         IF( I.LT.M ) THEN
+            CALL DLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1,
+     $                   TAU( KK ) )
+         ELSE
+            TAU( KK ) = ZERO
+         END IF
+*
+*        Check if TAU(KK) contains NaN, set INFO parameter
+*        to the column number where NaN is found and return from
+*        the routine.
+*        NOTE: There is no need to check TAU(KK) for Inf,
+*        since DLARFG cannot produce TAU(KK) or Householder vector
+*        below the diagonal containing Inf. Only BETA on the diagonal,
+*        returned by DLARFG can contain Inf, which requires
+*        TAU(KK) to contain NaN. Therefore, this case of generating Inf
+*        by DLARFG is covered by checking TAU(KK) for NaN.
+*
+         IF( DISNAN( TAU(KK) ) ) THEN
+            K = KK - 1
+            INFO = KK
+*
+*           Set MAXC2NRMK and RELMAXC2NRMK to NaN.
+*
+            MAXC2NRMK = TAU( KK )
+            RELMAXC2NRMK = TAU( KK )
+*
+*           Array TAU(KK:MINMNFACT) is not set and contains
+*           undefined elements, except the first element TAU(KK) = NaN.
+*
+            RETURN
+         END IF
+*
+*        Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left.
+*        ( If M >= N, then at KK = N there is no residual matrix,
+*          i.e. no columns of A to update, only columns of B.
+*          If M < N, then at KK = M-IOFFSET, I = M and we have a
+*          one-row residual matrix in A and the elementary
+*          reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update
+*          is needed for the residual matrix in A and the
+*          right-hand-side matrix in B.
+*          Therefore, we update only if
+*          KK < MINMNUPDT = min(M-IOFFSET, N+NRHS)
+*          condition is satisfied, not only KK < N+NRHS )
+*
+         IF( KK.LT.MINMNUPDT ) THEN
+            AIKK = A( I, KK )
+            A( I, KK ) = ONE
+            CALL DLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1,
+     $                  TAU( KK ), A( I, KK+1 ), LDA, WORK( 1 ) )
+            A( I, KK ) = AIKK
+         END IF
+*
+         IF( KK.LT.MINMNFACT ) THEN
+*
+*           Update the partial column 2-norms for the residual matrix,
+*           only if the residual matrix A(I+1:M,KK+1:N) exists, i.e.
+*           when KK < min(M-IOFFSET, N).
+*
+            DO J = KK + 1, N
+               IF( VN1( J ).NE.ZERO ) THEN
+*
+*                 NOTE: The following lines follow from the analysis in
+*                 Lapack Working Note 176.
+*
+                  TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2
+                  TEMP = MAX( TEMP, ZERO )
+                  TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+                  IF( TEMP2 .LE. TOL3Z ) THEN
+*
+*                    Compute the column 2-norm for the partial
+*                    column A(I+1:M,J) by explicitly computing it,
+*                    and store it in both partial 2-norm vector VN1
+*                    and exact column 2-norm vector VN2.
+*
+                     VN1( J ) = DNRM2( M-I, A( I+1, J ), 1 )
+                     VN2( J ) = VN1( J )
+*
+                  ELSE
+*
+*                    Update the column 2-norm for the partial
+*                    column A(I+1:M,J) by removing one
+*                    element A(I,J) and store it in partial
+*                    2-norm vector VN1.
+*
+                     VN1( J ) = VN1( J )*SQRT( TEMP )
+*
+                  END IF
+               END IF
+            END DO
+*
+         END IF
+*
+*        End factorization loop
+*
+      END DO
+*
+*     If we reached this point, all columns have been factorized,
+*     i.e. no condition was triggered to exit the routine.
+*     Set the number of factorized columns.
+*
+      K = KMAX
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before
+*     we return.
+* + IF( K.LT.MINMNFACT ) THEN +* + JMAXC2NRM = K + IDAMAX( N-K, VN1( K+1 ), 1 ) + MAXC2NRMK = VN1( JMAXC2NRM ) +* + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + END IF +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, set TAUs corresponding to the columns that were +* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. +* + DO J = K + 1, MINMNFACT + TAU( J ) = ZERO + END DO +* + RETURN +* +* End of DLAQP2RK +* + END diff --git a/lapack-netlib/SRC/dlaqp3rk.c b/lapack-netlib/SRC/dlaqp3rk.c new file mode 100644 index 000000000..e8c61c257 --- /dev/null +++ b/lapack-netlib/SRC/dlaqp3rk.c @@ -0,0 +1,1113 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + 
ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + dgemm_("No transpose", "Transpose", &i__1, nrhs, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* Quick return, if the submatrix A(I:M,K:N) is */ +/* a zero matrix. We need to check it only if the column index */ +/* (same as row index) is larger than 1, since the condition */ +/* for the whole original matrix A_orig is checked in the main */ +/* routine. */ + + if (*maxc2nrmk == 0.) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *relmaxc2nrmk = 0.; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix is zero and we stop the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + dgemm_("No transpose", "Transpose", &i__1, nrhs, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + tau[j] = 0.; + } + +/* Return from the routine. 
*/ + + return 0; + + } + +/* ============================================================ */ + +/* Check if the submatrix A(I:M,K:N) contains Inf, */ +/* set INFO parameter to the column number, where */ +/* the first Inf is found plus N, and continue */ +/* the computation. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + if (*info == 0 && *maxc2nrmk > myhugeval) { + *info = *n + k - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third tolerance stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL.GE.ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig; */ + + *kb = k - 1; + if__ = i__ - 1; + +/* Apply the block reflector to the residual of the */ +/* matrix A and the residual of the right hand sides B, if */ +/* the residual matrix and and/or the residual of the right */ +/* hand sides exist, i.e. if the submatrix */ +/* A(I+1:M,KB+1:N+NRHS) exists. This occurs when */ +/* KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + dgemm_("No transpose", "Transpose", &i__1, &i__2, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*kb + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + tau[j] = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,K:N): */ +/* 1) swap the K-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */ +/* 3) copy the K-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. (Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than K in the next loop step.) */ +/* 4) Save the pivot interchange with the indices relative to the */ +/* the original matrix A_orig, not the block A(1:M,1:N). 
*/ + + if (kp != k) { + dswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + dswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf); + vn1[kp] = vn1[k]; + vn2[kp] = vn2[k]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[k]; + jpiv[k] = itemp; + } + +/* Apply previous Householder reflectors to column K: */ +/* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b7, &a[i__ + a_dim1], lda, + &f[k + f_dim1], ldf, &c_b8, &a[i__ + k * a_dim1], &c__1); + } + +/* Generate elementary reflector H(k) using the column A(I:M,K). */ + + if (i__ < *m) { + i__1 = *m - i__ + 1; + dlarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], & + c__1, &tau[k]); + } else { + tau[k] = 0.; + } + +/* Check if TAU(K) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(K) for Inf, */ +/* since DLARFG cannot produce TAU(K) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by DLARFG can contain Inf, which requires */ +/* TAU(K) to contain NaN. Therefore, this case of generating Inf */ +/* by DLARFG is covered by checking TAU(K) for NaN. */ + + if (disnan_(&tau[k])) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *info = k; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = tau[k]; + *relmaxc2nrmk = tau[k]; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix contains NaN and we stop */ +/* the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + dgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &c_b7, & + a[if__ + 1 + a_dim1], lda, &f[*n + 1 + f_dim1], ldf, & + c_b8, &a[if__ + 1 + (*n + 1) * a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* =============================================================== */ + + aik = a[i__ + k * a_dim1]; + a[i__ + k * a_dim1] = 1.; + +/* =============================================================== */ + +/* Compute the current K-th column of F: */ +/* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K). */ + + if (k < *n + *nrhs) { + i__1 = *m - i__ + 1; + i__2 = *n + *nrhs - k; + dgemv_("Transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + 1) * + a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b30, &f[k + + 1 + k * f_dim1], &c__1); + } + +/* 2) Zero out elements above and on the diagonal of the */ +/* column K in matrix F, i.e elements F(1:K,K). 
*/ + + i__1 = k; + for (j = 1; j <= i__1; ++j) { + f[j + k * f_dim1] = 0.; + } + +/* 3) Incremental updating of the K-th column of F: */ +/* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T */ +/* * A(I:M,K). */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + d__1 = -tau[k]; + dgemv_("Transpose", &i__1, &i__2, &d__1, &a[i__ + a_dim1], lda, & + a[i__ + k * a_dim1], &c__1, &c_b30, &auxv[1], &c__1); + + i__1 = *n + *nrhs; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b8, &f[f_dim1 + 1], ldf, & + auxv[1], &c__1, &c_b8, &f[k * f_dim1 + 1], &c__1); + } + +/* =============================================================== */ + +/* Update the current I-th row of A: */ +/* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */ +/* - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. */ + + if (k < *n + *nrhs) { + i__1 = *n + *nrhs - k; + dgemv_("No transpose", &i__1, &k, &c_b7, &f[k + 1 + f_dim1], ldf, + &a[i__ + a_dim1], lda, &c_b8, &a[i__ + (k + 1) * a_dim1], + lda); + } + + a[i__ + k * a_dim1] = aik; + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */ +/* when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */ + + if (k < minmnfact) { + + i__1 = *n; + for (j = k + 1; j <= i__1; ++j) { + if (vn1[j] != 0.) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + + temp = (d__1 = a[i__ + j * a_dim1], abs(d__1)) / vn1[j]; +/* Computing MAX */ + d__1 = 0., d__2 = (temp + 1.) * (1. - temp); + temp = f2cmax(d__1,d__2); +/* Computing 2nd power */ + d__1 = vn1[j] / vn2[j]; + temp2 = temp * (d__1 * d__1); + if (temp2 <= tol3z) { + +/* At J-index, we have a difficult column for the */ +/* update of the 2-norm. Save the index of the previous */ +/* difficult column in IWORK(J-1). */ +/* NOTE: ILSTCC > 1, threfore we can use IWORK only */ +/* with N-1 elements, where the elements are */ +/* shifted by 1 to the left. */ + + iwork[j - 1] = lsticc; + +/* Set the index of the last difficult column LSTICC. */ + + lsticc = j; + + } else { + vn1[j] *= sqrt(temp); + } + } + } + + } + +/* End of while loop. */ + + } + +/* Now, afler the loop: */ +/* Set KB, the number of factorized columns in the block; */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig, IF = IOFFSET + KB. */ + + *kb = k; + if__ = i__; + +/* Apply the block reflector to the residual of the matrix A */ +/* and the residual of the right hand sides B, if the residual */ +/* matrix and and/or the residual of the right hand sides */ +/* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */ +/* This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + dgemm_("No transpose", "Transpose", &i__1, &i__2, kb, &c_b7, &a[if__ + + 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b8, &a[if__ + + 1 + (*kb + 1) * a_dim1], lda); + } + +/* Recompute the 2-norm of the difficult columns. */ +/* Loop over the index of the difficult columns from the largest */ +/* to the smallest index. */ + + while(lsticc > 0) { + +/* LSTICC is the index of the last difficult column is greater */ +/* than 1. */ +/* ITEMP is the index of the previous difficult column. 
*/
+
+	itemp = iwork[lsticc - 1];
+
+/*        Compute the 2-norm explicitly for the last difficult column and */
+/*        save it in the partial and exact 2-norm vectors VN1 and VN2. */
+
+/*        NOTE: The computation of VN1( LSTICC ) relies on the fact that */
+/*        DNRM2 does not fail on vectors with norm below the value of */
+/*        SQRT(DLAMCH('S')) */
+
+	i__1 = *m - if__;
+	vn1[lsticc] = dnrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1);
+	vn2[lsticc] = vn1[lsticc];
+
+/*        Downdate the index of the last difficult column to */
+/*        the index of the previous difficult column. */
+
+	lsticc = itemp;
+
+    }
+
+    return 0;
+
+/*     End of DLAQP3RK */
+
+} /* dlaqp3rk_ */
+
diff --git a/lapack-netlib/SRC/dlaqp3rk.f b/lapack-netlib/SRC/dlaqp3rk.f
new file mode 100644
index 000000000..8139345ed
--- /dev/null
+++ b/lapack-netlib/SRC/dlaqp3rk.f
@@ -0,0 +1,935 @@
+*> \brief \b DLAQP3RK computes a step of truncated QR factorization with column pivoting of a real m-by-n matrix A using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DLAQP3RK + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE DLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+*     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+*     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+*      IMPLICIT NONE
+*      LOGICAL            DONE
+*      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+*     $                   NB, NRHS
+*      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*
+*     .. Scalar Arguments ..
+*      LOGICAL            DONE
+*      INTEGER            KB, LDA, LDF, M, N, NB, NRHS, IOFFSET
+*      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+*      INTEGER            IWORK( * ), JPIV( * )
+*      DOUBLE PRECISION   A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ),
+*     $                   VN1( * ), VN2( * )
+*     ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLAQP3RK computes a step of truncated QR factorization with column
+*> pivoting of a real M-by-N matrix A block A(IOFFSET+1:M,1:N)
+*> by using Level 3 BLAS as
+*>
+*>                A * P(KB) = Q(KB) * R(KB).
+*>
+*> The routine tries to factorize NB columns from A starting from
+*> the row IOFFSET+1 and updates the residual matrix with BLAS 3
+*> xGEMM. The number of actually factorized columns is returned
+*> in KB and may be smaller than NB.
+*>
+*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides B matrix stored
+*> in A(IOFFSET+1:M,N+1:N+NRHS) with Q(KB)**T * B.
+*>
+*> Cases when the number of factorized columns KB < NB:
+*>
+*> (1) In some cases, due to catastrophic cancellations, it cannot
+*> factorize all NB columns and needs to update the residual matrix.
+*> Hence, the actual number of factorized columns in the block returned
+*> in KB is smaller than NB. The logical DONE is returned as FALSE.
+*> The factorization of the whole original matrix A_orig must proceed
+*> with the next block.
+*>
+*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB.
+*> +*> (3) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix is a zero matrix in some factorization +*> step KB, the factorization of the whole original matrix A_orig is +*> stopped, the logical DONE is returned as TRUE. The number of +*> factorized columns which is smaller than NB is returned in KB. +*> +*> (4) Whenever NaN is detected in the matrix A or in the array TAU, +*> the factorization of the whole original matrix A_orig is stopped, +*> the logical DONE is returned as TRUE. The number of factorized +*> columns which is smaller than NB is returned in KB. The INFO +*> parameter is set to the column index of the first NaN occurrence. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0 +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> Factorization block size, i.e the number of columns +*> to factorize in the matrix A. 0 <= NB +*> +*> If NB = 0, then the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine DGEQP3RK. 1 <= KP1 <= N_orig. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is DOUBLE PRECISION +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine DGEQP3RK. +*> MAXC2NRM >= 0. 
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:KB) below
+*>             the diagonal together with the array TAU represent
+*>             the orthogonal matrix Q(KB) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:KB) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,KB+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(KB)**T.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] DONE
+*> \verbatim
+*>          DONE is LOGICAL
+*>          TRUE: a) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to ABSTOL
+*>                   or RELTOL criterion,
+*>                b) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to the
+*>                   residual matrix being a ZERO matrix.
+*>                c) when NaN was detected in the matrix A
+*>                   or in the array TAU.
+*>          FALSE: otherwise.
+*> \endverbatim
+*>
+*> \param[out] KB
+*> \verbatim
+*>          KB is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= KB <= min(M-IOFFSET,NB,N).
+*>
+*>          KB also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is DOUBLE PRECISION
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank KB. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is DOUBLE PRECISION
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank KB) to the maximum column 2-norm of the
+*>          original matrix A_orig. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is DOUBLE PRECISION array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] AUXV
+*> \verbatim
+*>          AUXV is DOUBLE PRECISION array, dimension (NB)
+*>          Auxiliary vector.
+*> \endverbatim
+*>
+*> \param[out] F
+*> \verbatim
+*>          F is DOUBLE PRECISION array, dimension (LDF,NB)
+*>          Matrix F**T = L*(Y**T)*A.
+*> \endverbatim
+*>
+*> \param[in] LDF
+*> \verbatim
+*>          LDF is INTEGER
+*>          The leading dimension of the array F. 
LDF >= max(1,N+NRHS). +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step KB+1 ( when KB columns +*> have been factorized ). +*> +*> On exit: +*> KB is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(KB+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=KB+1, TAU(KB+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the actorization +*> step KB+1 ( when KB columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp3rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, + $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + LOGICAL DONE + INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, + $ NB, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. 
+ INTEGER IWORK( * ), JPIV( * ) + DOUBLE PRECISION A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ), + $ VN1( * ), VN2( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + INTEGER ITEMP, J, K, MINMNFACT, MINMNUPDT, + $ LSTICC, KP, I, IF + DOUBLE PRECISION AIK, HUGEVAL, TEMP, TEMP2, TOL3Z +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DGEMV, DLARFG, DSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX + DOUBLE PRECISION DLAMCH, DNRM2 + EXTERNAL DISNAN, DLAMCH, IDAMAX, DNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + NB = MIN( NB, MINMNFACT ) + TOL3Z = SQRT( DLAMCH( 'Epsilon' ) ) + HUGEVAL = DLAMCH( 'Overflow' ) +* +* Compute factorization in a while loop over NB columns, +* K is the column index in the block A(1:M,1:N). +* + K = 0 + LSTICC = 0 + DONE = .FALSE. +* + DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 ) + K = K + 1 + I = IOFFSET + K +* + IF( I.EQ.1 ) THEN +* +* We are at the first column of the original whole matrix A_orig, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + KP = KP1 +* + ELSE +* +* Determine the pivot column in K-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( K-1 ) + IDAMAX( N-K+1, VN1( K ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,K:N) in step K. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains NaN, set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( DISNAN( MAXC2NRMK ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = KB + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. + + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. 
+* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* Quick return, if the submatrix A(I:M,K:N) is +* a zero matrix. We need to check it only if the column index +* (same as row index) is larger than 1, since the condition +* for the whole original matrix A_orig is checked in the main +* routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + RELMAXC2NRMK = ZERO +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix is zero and we stop the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. +* + DO J = K, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + K - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third tolerance stopping criteria. +* NOTE: There is no need to test for ABSTOL.GE.ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig; +* + KB = K - 1 + IF = I - 1 +* +* Apply the block reflector to the residual of the +* matrix A and the residual of the right hand sides B, if +* the residual matrix and and/or the residual of the right +* hand sides exist, i.e. 
if the submatrix +* A(I+1:M,KB+1:N+NRHS) exists. This occurs when +* KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, N+NRHS-KB, KB,-ONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. +* + DO J = K, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,K:N): +* 1) swap the K-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) +* 3) copy the K-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. (Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than K in the next loop step.) +* 4) Save the pivot interchange with the indices relative to the +* the original matrix A_orig, not the block A(1:M,1:N). +* + IF( KP.NE.K ) THEN + CALL DSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 ) + CALL DSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF ) + VN1( KP ) = VN1( K ) + VN2( KP ) = VN2( K ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( K ) + JPIV( K ) = ITEMP + END IF +* +* Apply previous Householder reflectors to column K: +* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. +* + IF( K.GT.1 ) THEN + CALL DGEMV( 'No transpose', M-I+1, K-1, -ONE, A( I, 1 ), + $ LDA, F( K, 1 ), LDF, ONE, A( I, K ), 1 ) + END IF +* +* Generate elementary reflector H(k) using the column A(I:M,K). +* + IF( I.LT.M ) THEN + CALL DLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) ) + ELSE + TAU( K ) = ZERO + END IF +* +* Check if TAU(K) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(K) for Inf, +* since DLARFG cannot produce TAU(K) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by DLARFG can contain Inf, which requires +* TAU(K) to contain NaN. Therefore, this case of generating Inf +* by DLARFG is covered by checking TAU(K) for NaN. +* + IF( DISNAN( TAU(K) ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = K +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAU( K ) + RELMAXC2NRMK = TAU( K ) +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. 
This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* =============================================================== +* + AIK = A( I, K ) + A( I, K ) = ONE +* +* =============================================================== +* +* Compute the current K-th column of F: +* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K). +* + IF( K.LT.N+NRHS ) THEN + CALL DGEMV( 'Transpose', M-I+1, N+NRHS-K, + $ TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1, + $ ZERO, F( K+1, K ), 1 ) + END IF +* +* 2) Zero out elements above and on the diagonal of the +* column K in matrix F, i.e elements F(1:K,K). +* + DO J = 1, K + F( J, K ) = ZERO + END DO +* +* 3) Incremental updating of the K-th column of F: +* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T +* * A(I:M,K). +* + IF( K.GT.1 ) THEN + CALL DGEMV( 'Transpose', M-I+1, K-1, -TAU( K ), + $ A( I, 1 ), LDA, A( I, K ), 1, ZERO, + $ AUXV( 1 ), 1 ) +* + CALL DGEMV( 'No transpose', N+NRHS, K-1, ONE, + $ F( 1, 1 ), LDF, AUXV( 1 ), 1, ONE, + $ F( 1, K ), 1 ) + END IF +* +* =============================================================== +* +* Update the current I-th row of A: +* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) +* - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. +* + IF( K.LT.N+NRHS ) THEN + CALL DGEMV( 'No transpose', N+NRHS-K, K, -ONE, + $ F( K+1, 1 ), LDF, A( I, 1 ), LDA, ONE, + $ A( I, K+1 ), LDA ) + END IF +* + A( I, K ) = AIK +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. +* when K < MINMNFACT = min( M-IOFFSET, N ). +* + IF( K.LT.MINMNFACT ) THEN +* + DO J = K + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ABS( A( I, J ) ) / VN1( J ) + TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2.LE.TOL3Z ) THEN +* +* At J-index, we have a difficult column for the +* update of the 2-norm. Save the index of the previous +* difficult column in IWORK(J-1). +* NOTE: ILSTCC > 1, threfore we can use IWORK only +* with N-1 elements, where the elements are +* shifted by 1 to the left. +* + IWORK( J-1 ) = LSTICC +* +* Set the index of the last difficult column LSTICC. +* + LSTICC = J +* + ELSE + VN1( J ) = VN1( J )*SQRT( TEMP ) + END IF + END IF + END DO +* + END IF +* +* End of while loop. +* + END DO +* +* Now, afler the loop: +* Set KB, the number of factorized columns in the block; +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig, IF = IOFFSET + KB. +* + KB = K + IF = I +* +* Apply the block reflector to the residual of the matrix A +* and the residual of the right hand sides B, if the residual +* matrix and and/or the residual of the right hand sides +* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. 
+* This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, N+NRHS-KB, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* Recompute the 2-norm of the difficult columns. +* Loop over the index of the difficult columns from the largest +* to the smallest index. +* + DO WHILE( LSTICC.GT.0 ) +* +* LSTICC is the index of the last difficult column is greater +* than 1. +* ITEMP is the index of the previous difficult column. +* + ITEMP = IWORK( LSTICC-1 ) +* +* Compute the 2-norm explicilty for the last difficult column and +* save it in the partial and exact 2-norm vectors VN1 and VN2. +* +* NOTE: The computation of VN1( LSTICC ) relies on the fact that +* DNRM2 does not fail on vectors with norm below the value of +* SQRT(DLAMCH('S')) +* + VN1( LSTICC ) = DNRM2( M-IF, A( IF+1, LSTICC ), 1 ) + VN2( LSTICC ) = VN1( LSTICC ) +* +* Downdate the index of the last difficult column to +* the index of the previous difficult column. +* + LSTICC = ITEMP +* + END DO +* + RETURN +* +* End of DLAQP3RK +* + END diff --git a/lapack-netlib/SRC/dlaswlq.f b/lapack-netlib/SRC/dlaswlq.f index c95c94cbc..636c12dc8 100644 --- a/lapack-netlib/SRC/dlaswlq.f +++ b/lapack-netlib/SRC/dlaswlq.f @@ -99,19 +99,22 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) -*> +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= MB*M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -159,33 +162,37 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup laswlq +*> * ===================================================================== SUBROUTINE DLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), T( LDT, *) + DOUBLE PRECISION A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL DGELQT, DTPLQT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. 
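Both this DLASWLQ hunk and the DLATSQR one below normalize the LWORK = -1 workspace query so that WORK(1) returns the minimal LWORK. As a rough illustration of how a C caller would use that convention; this sketch assumes an LP64 build (32-bit `blasint`), a gfortran-style trailing-underscore binding, and the hypothetical helper name `query_dlaswlq_lwork`:

```c
/* Workspace query against DLASWLQ: with LWORK = -1 the routine only
   validates its arguments and stores the minimal LWORK in WORK(1).
   lda/ldt must still pass the argument checks
   (lda >= max(1,m), ldt >= mb). */
extern void dlaswlq_(const int *m, const int *n, const int *mb,
                     const int *nb, double *a, const int *lda,
                     double *t, const int *ldt, double *work,
                     const int *lwork, int *info);

int query_dlaswlq_lwork(int m, int n, int mb, int nb, int lda, int ldt)
{
    double wkopt = 0.0;
    int lwork = -1, info = 0;
    double dummy_a = 0.0, dummy_t = 0.0;   /* not referenced in a query */
    dlaswlq_(&m, &n, &mb, &nb, &dummy_a, &lda, &dummy_t, &ldt,
             &wkopt, &lwork, &info);
    return info == 0 ? (int)wkopt : -1;    /* WORK(1) = minimal LWORK */
}
```

The same pattern applies verbatim to DLATSQR, whose hunks below make the identical MINMN/LWMIN change with LWMIN = N*NB.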
@@ -196,12 +203,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LT.0 ) THEN INFO = -4 @@ -209,60 +223,62 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = MB*M +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN - CALL DGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) + IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN + CALL DGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO ) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL DGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL DGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M, (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL DTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), - $ LDA, T(1, CTR * M + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL DTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + $ LDA, T(1, CTR * M + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL DTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), - $ LDA, T(1, CTR * M + 1), LDT, - $ WORK, INFO ) - END IF + $ LDA, T(1, CTR * M + 1), LDT, + $ WORK, INFO ) + END IF +* + WORK( 1 ) = LWMIN * - WORK( 1 ) = M * MB RETURN * * End of DLASWLQ diff --git a/lapack-netlib/SRC/dlatrs3.f b/lapack-netlib/SRC/dlatrs3.f index e6d78b672..d18675b2d 100644 --- a/lapack-netlib/SRC/dlatrs3.f +++ b/lapack-netlib/SRC/dlatrs3.f @@ -151,13 +151,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK). +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal size of *> WORK. *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -165,6 +169,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -181,7 +186,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
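The LWMIN logic this DLATRS3 hunk introduces is plain integer arithmetic over the block counts. Restated as a hypothetical C helper (NB, normally chosen internally via ILAENV, is taken as a parameter here, and NBRHS = 32 matches the MIN(NRHS, 32) term in the documentation):

```c
/* Minimal workspace of DLATRS3 as computed above:
   LSCALE = NBA * MAX(NBA, MIN(NRHS, NBRHS)) scale factors, plus
   LANRM = NBA * NBA block norm bounds; 1 when MIN(N, NRHS) = 0. */
static int dlatrs3_lwmin(int n, int nrhs, int nb)
{
    if (n == 0 || nrhs == 0)
        return 1;
    int nba = (n + nb - 1) / nb;            /* number of block rows */
    int k = nrhs < 32 ? nrhs : 32;          /* NBRHS = 32 */
    int lscale = nba * (nba > k ? nba : k);
    int lanrm = nba * nba;
    return lscale + lanrm;
}
```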
* -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -253,7 +258,7 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. @@ -292,15 +297,24 @@ * row. WORK( I+KK*LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = LWMIN * * Test the input parameters * @@ -322,7 +336,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN @@ -649,6 +663,9 @@ END IF END DO END DO +* + WORK( 1 ) = LWMIN +* RETURN * * End of DLATRS3 diff --git a/lapack-netlib/SRC/dlatsqr.f b/lapack-netlib/SRC/dlatsqr.f index 94a04be02..0000aab68 100644 --- a/lapack-netlib/SRC/dlatsqr.f +++ b/lapack-netlib/SRC/dlatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -161,27 +164,29 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE DLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), T(LDT, *) + DOUBLE PRECISION A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. 
LOGICAL LSAME @@ -198,6 +203,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -205,65 +217,67 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = NB*N +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF +* IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL DGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF + IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL DGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF * - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL DGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CALL DGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) * - CTR = 1 - DO I = MB+1, II-MB+N , (MB-N) + CTR = 1 + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL DTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, - $ T(1, CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL DTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + $ T(1, CTR * N + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL DTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, - $ T(1, CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + IF( II.LE.M ) THEN + CALL DTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + $ T(1, CTR * N + 1), LDT, + $ WORK, INFO ) + END IF * - WORK( 1 ) = N*NB + WORK( 1 ) = LWMIN RETURN * * End of DLATSQR diff --git a/lapack-netlib/SRC/dsyev_2stage.f b/lapack-netlib/SRC/dsyev_2stage.f index 50d51d992..286366bfe 100644 --- a/lapack-netlib/SRC/dsyev_2stage.f +++ b/lapack-netlib/SRC/dsyev_2stage.f @@ -20,7 +20,7 @@ * Definition: * =========== * -* SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, +* SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, * INFO ) * * IMPLICIT NONE @@ -97,7 +97,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension LWORK +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -105,12 +105,12 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. 
*> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 2*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 2*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -143,7 +143,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heev_2stage * *> \par Further Details: * ===================== @@ -161,7 +161,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -169,16 +169,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * * ===================================================================== - SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, + SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, $ INFO ) * IMPLICIT NONE @@ -305,7 +305,7 @@ LLWORK = LWORK - INDWRK + 1 * CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ), - $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, + $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, $ WORK( INDWRK ), LLWORK, IINFO ) * * For eigenvalues only, call DSTERF. For eigenvectors, first call diff --git a/lapack-netlib/SRC/dsyevd.f b/lapack-netlib/SRC/dsyevd.f index b27f4cdc7..adcfcb373 100644 --- a/lapack-netlib/SRC/dsyevd.f +++ b/lapack-netlib/SRC/dsyevd.f @@ -96,8 +96,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, -*> dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -160,7 +159,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevd * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/dsyevr.f b/lapack-netlib/SRC/dsyevr.f index 698691533..8647b0162 100644 --- a/lapack-netlib/SRC/dsyevr.f +++ b/lapack-netlib/SRC/dsyevr.f @@ -271,7 +271,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,26*N). +*> The dimension of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 26*N. *> For optimal efficiency, LWORK >= (NB+6)*N, *> where NB is the max of the blocksize for DSYTRD and DORMTR *> returned by ILAENV. @@ -285,13 +286,14 @@ *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) -*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. *> \endverbatim *> *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). 
+*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the IWORK array, @@ -315,7 +317,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevr * *> \par Contributors: * ================== @@ -390,8 +392,13 @@ * LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) ) * - LWMIN = MAX( 1, 26*N ) - LIWMIN = MAX( 1, 10*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 26*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -450,7 +457,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 7 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = A( 1, 1 ) diff --git a/lapack-netlib/SRC/dsyevr_2stage.f b/lapack-netlib/SRC/dsyevr_2stage.f index 09242bbd3..63d5e3159 100644 --- a/lapack-netlib/SRC/dsyevr_2stage.f +++ b/lapack-netlib/SRC/dsyevr_2stage.f @@ -263,7 +263,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of DSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the orthogonal transformations applied by DORMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -277,12 +277,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 5*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 5*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -300,13 +301,14 @@ *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) -*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. *> \endverbatim *> *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the IWORK array, @@ -330,7 +332,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -358,7 +360,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -366,11 +368,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. 
-*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -444,8 +446,14 @@ IB = ILAENV2STAGE( 2, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -484,7 +492,7 @@ * NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) * NB = MAX( NB, ILAENV( 1, 'DORMTR', UPLO, N, -1, -1, -1 ) ) * LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN END IF * @@ -504,7 +512,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 7 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = A( 1, 1 ) @@ -608,7 +616,7 @@ * Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form. * * - CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), + CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), $ WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ), $ LHTRD, WORK( INDWK ), LLWORK, IINFO ) * @@ -727,7 +735,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/dsyevx.f b/lapack-netlib/SRC/dsyevx.f index 99719874b..fd6a78e32 100644 --- a/lapack-netlib/SRC/dsyevx.f +++ b/lapack-netlib/SRC/dsyevx.f @@ -244,7 +244,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevx * * ===================================================================== SUBROUTINE DSYEVX( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, IL, IU, @@ -338,14 +338,14 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWKMIN = 1 - WORK( 1 ) = LWKMIN + LWKOPT = 1 ELSE LWKMIN = 8*N NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'DORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( LWKMIN, ( NB + 3 )*N ) - WORK( 1 ) = LWKOPT END IF + WORK( 1 ) = LWKOPT * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -17 diff --git a/lapack-netlib/SRC/dsysv_aa.f b/lapack-netlib/SRC/dsysv_aa.f index 8dab5a384..0a96ecd7e 100644 --- a/lapack-netlib/SRC/dsysv_aa.f +++ b/lapack-netlib/SRC/dsysv_aa.f @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYsolve +*> \ingroup hesv_aa * * ===================================================================== SUBROUTINE DSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -177,7 +177,7 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS + INTEGER LWKMIN, LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS * .. * .. External Functions .. LOGICAL LSAME @@ -196,6 +196,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -206,17 +207,17 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_SYTRF = INT( WORK(1) ) + LWKOPT_SYTRF = INT( WORK( 1 ) ) CALL DSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_SYTRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) + LWKOPT_SYTRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_SYTRF, LWKOPT_SYTRS ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/dsysv_aa_2stage.f b/lapack-netlib/SRC/dsysv_aa_2stage.f index 72fbe1e9a..90dd0a38a 100644 --- a/lapack-netlib/SRC/dsysv_aa_2stage.f +++ b/lapack-netlib/SRC/dsysv_aa_2stage.f @@ -101,14 +101,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is DOUBLE PRECISION array, dimension (LTB) +*> TB is DOUBLE PRECISION array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -148,14 +148,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION workspace of size LWORK +*> WORK is DOUBLE PRECISION workspace of size (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -179,7 +180,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYsolve +*> \ingroup hesv_aa_2stage * * ===================================================================== SUBROUTINE DSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, @@ -205,7 +206,7 @@ * * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKMIN, LWKOPT * .. * .. External Functions .. LOGICAL LSAME @@ -226,6 +227,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -234,18 +236,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN @@ -255,7 +258,6 @@ RETURN END IF * -* * Compute the factorization A = U**T*T*U or A = L*T*L**T. * CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, diff --git a/lapack-netlib/SRC/dsysvx.f b/lapack-netlib/SRC/dsysvx.f index a30831e72..b2b8210ca 100644 --- a/lapack-netlib/SRC/dsysvx.f +++ b/lapack-netlib/SRC/dsysvx.f @@ -275,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
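*
*> \par Example (combined LTB and LWORK query in DSYSV_AA_2STAGE):
*> \verbatim
*> The two-stage Aasen solver documented above accepts simultaneous
*> queries of the band-storage size LTB and the workspace size LWORK,
*> as its own internal call to DSYTRF_AA_2STAGE illustrates. A sketch,
*> where TBQ and WKQ are hypothetical length-1 query buffers:
*>
*>    CALL DSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TBQ, -1,
*>   $                      IPIV, IPIV2, B, LDB, WKQ, -1, INFO )
*>    LTB   = INT( TBQ( 1 ) )
*>    LWORK = INT( WKQ( 1 ) )
*> \endverbatim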
* -*> \ingroup doubleSYsolve +*> \ingroup hesvx * * ===================================================================== SUBROUTINE DSYSVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B, @@ -305,7 +305,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKMIN, LWKOPT, NB DOUBLE PRECISION ANORM * .. * .. External Functions .. @@ -327,6 +327,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 3*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -344,12 +345,12 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 3*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 3*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'DSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) diff --git a/lapack-netlib/SRC/dsytrd.f b/lapack-netlib/SRC/dsytrd.f index 3dcfc3db2..58d4b633b 100644 --- a/lapack-netlib/SRC/dsytrd.f +++ b/lapack-netlib/SRC/dsytrd.f @@ -139,7 +139,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrd * *> \par Further Details: * ===================== @@ -247,7 +247,7 @@ * Determine the block size. * NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/dsytrd_2stage.f b/lapack-netlib/SRC/dsytrd_2stage.f index 8ae77d3e4..a88ac1c73 100644 --- a/lapack-netlib/SRC/dsytrd_2stage.f +++ b/lapack-netlib/SRC/dsytrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download DSYTRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download DSYTRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * DOUBLE PRECISION A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. -* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. 
*> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the orthogonal *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is DOUBLE PRECISION array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is DOUBLE PRECISION array, dimension (LHOUS2) +*> HOUS2 is DOUBLE PRECISION array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -143,23 +145,26 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +182,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +207,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +215,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. 
-*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -265,10 +270,13 @@ * KD = ILAENV2STAGE( 1, 'DSYTRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'DSYTRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'DSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -309,14 +317,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL DSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL DSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRD_SY2SB', -INFO ) RETURN END IF - CALL DSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL DSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRD_SB2ST', -INFO ) @@ -324,8 +332,7 @@ END IF * * - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN RETURN * * End of DSYTRD_2STAGE diff --git a/lapack-netlib/SRC/dsytrd_sb2st.F b/lapack-netlib/SRC/dsytrd_sb2st.F index bb74dd491..04d03d587 100644 --- a/lapack-netlib/SRC/dsytrd_sb2st.F +++ b/lapack-netlib/SRC/dsytrd_sb2st.F @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, +* SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, * D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * #if defined(_OPENMP) @@ -53,12 +53,12 @@ *> \param[in] STAGE1 *> \verbatim *> STAGE1 is CHARACTER*1 -*> = 'N': "No": to mention that the stage 1 of the reduction +*> = 'N': "No": to mention that the stage 1 of the reduction *> from dense to band using the dsytrd_sy2sb routine -*> was not called before this routine to reproduce AB. -*> In other term this routine is called as standalone. -*> = 'Y': "Yes": to mention that the stage 1 of the -*> reduction from dense to band using the dsytrd_sy2sb +*> was not called before this routine to reproduce AB. +*> In other term this routine is called as standalone. +*> = 'Y': "Yes": to mention that the stage 1 of the +*> reduction from dense to band using the dsytrd_sy2sb *> routine has been called to produce AB (e.g., AB is *> the output of dsytrd_sy2sb. 
*> \endverbatim @@ -66,10 +66,10 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> and thus LHOUS is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate or to apply Q later on, +*> = 'V': the Householder representation is needed to +*> either generate or to apply Q later on, *> then LHOUS is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -132,34 +132,39 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is DOUBLE PRECISION array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is DOUBLE PRECISION array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error *> message related to LHOUS is issued by XERBLA. *> LHOUS = MAX(1, dimension) where *> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> not available now if VECT='H' *> \endverbatim *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension LWORK. +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -188,7 +193,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup real16OTHERcomputational +*> \ingroup hetrd_hb2st * *> \par Further Details: * ===================== @@ -208,7 +213,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -216,16 +221,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. 
-*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * #if defined(_OPENMP) @@ -258,11 +263,11 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, WANTQ, UPPER, AFTERS1 - INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, + INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, - $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, + $ NBTILES, TTYPE, TID, NTHREADS, + $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SIDEV, SIZETAU, LDV, LHMIN, LWMIN * .. @@ -274,7 +279,7 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV2STAGE + INTEGER ILAENV2STAGE EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. @@ -282,7 +287,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -291,9 +295,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -355,7 +364,7 @@ ABDPOS = KD + 1 ABOFDPOS = KD ELSE - APOS = INDA + APOS = INDA AWPOS = INDA + KD + 1 DPOS = APOS OFDPOS = DPOS + 1 @@ -363,11 +372,11 @@ ABOFDPOS = 2 ENDIF -* -* Case KD=0: -* The matrix is diagonal. We just copy it (convert to "real" for -* real because D is double and the imaginary part should be 0) -* and store it in D. A sequential code here is better or +* +* Case KD=0: +* The matrix is diagonal. We just copy it (convert to "real" for +* real because D is double and the imaginary part should be 0) +* and store it in D. A sequential code here is better or * in a parallel environment it might need two cores for D and E * IF( KD.EQ.0 ) THEN @@ -382,17 +391,17 @@ WORK( 1 ) = 1 RETURN END IF -* -* Case KD=1: -* The matrix is already Tridiagonal. We have to make diagonal +* +* Case KD=1: +* The matrix is already Tridiagonal. We have to make diagonal * and offdiagonal elements real, and store them in D and E. -* For that, for real precision just copy the diag and offdiag -* to D and E while for the COMPLEX case the bulge chasing is -* performed to convert the hermetian tridiagonal to symmetric -* tridiagonal. A simpler conversion formula might be used, but then +* For that, for real precision just copy the diag and offdiag +* to D and E while for the COMPLEX case the bulge chasing is +* performed to convert the hermetian tridiagonal to symmetric +* tridiagonal. A simpler conversion formula might be used, but then * updating the Q matrix will be required and based if Q is generated -* or not this might complicate the story. -* +* or not this might complicate the story. 
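* (In this real-precision routine the plain copy below suffices; the
* bulge chasing mentioned above is needed only in the complex,
* Hermitian, case.)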
+* IF( KD.EQ.1 ) THEN DO 50 I = 1, N D( I ) = ( AB( ABDPOS, I ) ) @@ -413,7 +422,7 @@ RETURN END IF * -* Main code start here. +* Main code start here. * Reduce the symmetric band of A to a tridiagonal matrix. * THGRSIZ = N @@ -422,7 +431,7 @@ NBTILES = CEILING( REAL(N)/REAL(KD) ) STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) -* +* CALL DLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) CALL DLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) * @@ -431,7 +440,7 @@ * #if defined(_OPENMP) !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) -!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) !$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) !$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) !$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) @@ -440,7 +449,7 @@ #endif * * main bulge chasing loop -* +* DO 100 THGRID = 1, THGRNB STT = (THGRID-1)*THGRSIZ+1 THED = MIN( (STT + THGRSIZ -1), (N-1)) @@ -451,7 +460,7 @@ ST = STT DO 130 SWEEPID = ST, ED DO 140 K = 1, GRSIZ - MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) $ + (M-1)*GRSIZ + K IF ( MYID.EQ.1 ) THEN TTYPE = 1 @@ -477,16 +486,16 @@ ENDIF * * Call the kernel -* +* #if defined(_OPENMP) && _OPENMP >= 201307 - IF( TTYPE.NE.1 ) THEN + IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK @@ -494,20 +503,20 @@ !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK ENDIF #else - CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW ) ) -#endif +#endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 EXIT @@ -522,14 +531,14 @@ !$OMP END MASTER !$OMP END PARALLEL #endif -* +* * Copy the diagonal from A to D. Note that D is REAL thus only * the Real part is needed, the imaginary part should be zero. * DO 150 I = 1, N D( I ) = ( WORK( DPOS+(I-1)*LDA ) ) 150 CONTINUE -* +* * Copy the off diagonal from A to E. Note that E is REAL thus only * the Real part is needed, the imaginary part should be zero. * @@ -543,11 +552,10 @@ 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN WORK( 1 ) = LWMIN RETURN * * End of DSYTRD_SB2ST * END - + diff --git a/lapack-netlib/SRC/dsytrd_sy2sb.f b/lapack-netlib/SRC/dsytrd_sy2sb.f index 1660b5c7e..38acc71f1 100644 --- a/lapack-netlib/SRC/dsytrd_sy2sb.f +++ b/lapack-netlib/SRC/dsytrd_sy2sb.f @@ -123,8 +123,8 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. 
*> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY) +*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -158,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrd_he2hb * *> \par Further Details: * ===================== @@ -293,8 +295,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', ' ', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/dsytrf.f b/lapack-netlib/SRC/dsytrf.f index aee9b3f6a..2a1a2d4dc 100644 --- a/lapack-netlib/SRC/dsytrf.f +++ b/lapack-netlib/SRC/dsytrf.f @@ -107,7 +107,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -135,7 +135,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf * *> \par Further Details: * ===================== @@ -352,6 +352,7 @@ END IF * 40 CONTINUE +* WORK( 1 ) = LWKOPT RETURN * diff --git a/lapack-netlib/SRC/dsytrf_aa.f b/lapack-netlib/SRC/dsytrf_aa.f index 9a0b26ce5..924d4c165 100644 --- a/lapack-netlib/SRC/dsytrf_aa.f +++ b/lapack-netlib/SRC/dsytrf_aa.f @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= MAX(1,2*N). For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -125,10 +127,10 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -152,7 +154,7 @@ * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB DOUBLE PRECISION ALPHA * .. @@ -179,18 +181,25 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N WORK( 1 ) = LWKOPT END IF * @@ -203,11 +212,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/dsytrf_aa_2stage.f b/lapack-netlib/SRC/dsytrf_aa_2stage.f index c65bd86e6..fae95bab2 100644 --- a/lapack-netlib/SRC/dsytrf_aa_2stage.f +++ b/lapack-netlib/SRC/dsytrf_aa_2stage.f @@ -87,14 +87,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is DOUBLE PRECISION array, dimension (LTB) +*> TB is DOUBLE PRECISION array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION workspace of size LWORK +*> WORK is DOUBLE PRECISION workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used +*> to select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -211,9 +211,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -227,10 +227,10 @@ NB = ILAENV( 1, 'DSYTRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = MAX( 1, (3*NB+1)*N ) END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = MAX( 1, N*NB ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -239,7 +239,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * diff --git a/lapack-netlib/SRC/dsytrf_rk.f b/lapack-netlib/SRC/dsytrf_rk.f index 086586968..0717eb076 100644 --- a/lapack-netlib/SRC/dsytrf_rk.f +++ b/lapack-netlib/SRC/dsytrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension ( MAX(1,LWORK) ). +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -229,7 +229,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
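*
*> \par Example (sizing WORK via ILAENV):
*> \verbatim
*> A hypothetical sketch of the "LWORK >= N*NB" recommendation quoted
*> above, using an explicit ILAENV call rather than an LWORK = -1
*> query:
*>
*>    NB    = ILAENV( 1, 'DSYTRF_RK', UPLO, N, -1, -1, -1 )
*>    LWORK = MAX( 1, N*NB )
*> \endverbatim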
* -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_rk * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/dsytrf_rook.f b/lapack-netlib/SRC/dsytrf_rook.f index 2f00d1802..316663485 100644 --- a/lapack-netlib/SRC/dsytrf_rook.f +++ b/lapack-netlib/SRC/dsytrf_rook.f @@ -118,7 +118,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -146,7 +146,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_rook * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/dsytri2.f b/lapack-netlib/SRC/dsytri2.f index dbcdcdb58..5960d3992 100644 --- a/lapack-netlib/SRC/dsytri2.f +++ b/lapack-netlib/SRC/dsytri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (N+NB+1)*(NB+3) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE DSYTRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -159,9 +159,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'DSYTRI2', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +177,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = MINSIZE RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - IF( NBMAX .GE. N ) THEN + IF( NBMAX.GE.N ) THEN CALL DSYTRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL DSYTRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of DSYTRI2 diff --git a/lapack-netlib/SRC/dsytri_3.f b/lapack-netlib/SRC/dsytri_3.f index 86d69cdfd..50834c605 100644 --- a/lapack-netlib/SRC/dsytri_3.f +++ b/lapack-netlib/SRC/dsytri_3.f @@ -119,16 +119,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (N+NB+1)*(NB+3). +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= (N+NB+1)*(NB+3). +*> The length of WORK. +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). 
*> -*> If LDWORK = -1, then a workspace query is assumed; +*> If LWORK = -1, then a workspace query is assumed; *> the routine only calculates the optimal size of the optimal *> size of the WORK array, returns this value as the first *> entry of the WORK array, and no error message related to @@ -152,7 +153,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetri_3 * *> \par Contributors: * ================== @@ -208,8 +209,13 @@ * * Determine the block size * - NB = MAX( 1, ILAENV( 1, 'DSYTRI_3', UPLO, N, -1, -1, -1 ) ) - LWKOPT = ( N+NB+1 ) * ( NB+3 ) + IF( N.EQ.0 ) THEN + LWKOPT = 1 + ELSE + NB = MAX( 1, ILAENV( 1, 'DSYTRI_3', UPLO, N, -1, -1, -1 ) ) + LWKOPT = ( N+NB+1 ) * ( NB+3 ) + END IF + WORK( 1 ) = LWKOPT * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 @@ -217,7 +223,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN INFO = -8 END IF * @@ -225,7 +231,6 @@ CALL XERBLA( 'DSYTRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWKOPT RETURN END IF * diff --git a/lapack-netlib/SRC/dsytrs_aa.f b/lapack-netlib/SRC/dsytrs_aa.f index 26b11a2a0..f0016cb7f 100644 --- a/lapack-netlib/SRC/dsytrs_aa.f +++ b/lapack-netlib/SRC/dsytrs_aa.f @@ -105,7 +105,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -123,7 +129,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrs_aa * * ===================================================================== SUBROUTINE DSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, @@ -151,7 +157,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME @@ -161,13 +167,19 @@ EXTERNAL DLACPY, DGTSV, DSWAP, DTRSM, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -178,21 +190,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKMIN RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. 
NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/ilaenv.c b/lapack-netlib/SRC/ilaenv.c index c47224a0c..8f3b2db8e 100644 --- a/lapack-netlib/SRC/ilaenv.c +++ b/lapack-netlib/SRC/ilaenv.c @@ -191,7 +191,7 @@ typedef struct Namelist Namelist; #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER #define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -252,11 +252,11 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; -#define mycycle() continue; -#define myceiling(w) {ceil(w)} -#define myhuge(w) {HUGE_VAL} +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} -#define mymaxloc(w,s,e,n) {dmaxloc_(w,*(s),*(e),n)} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) /* procedure parameter types for -A and -C++ */ @@ -509,12 +509,18 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + /* Table of constant values */ static integer c__1 = 1; -static real c_b174 = 0.f; -static real c_b175 = 1.f; +static real c_b179 = 0.f; +static real c_b180 = 1.f; static integer c__0 = 0; /* > \brief \b ILAENV */ @@ -599,9 +605,9 @@ f"> */ /* > = 9: maximum size of the subproblems at the bottom of the */ /* > computation tree in the divide-and-conquer algorithm */ /* > (used by xGELSD and xGESDD) */ -/* > =10: ieee NaN arithmetic can be trusted not to trap */ +/* > =10: ieee infinity and NaN arithmetic can be trusted not to trap */ /* > =11: infinity arithmetic can be trusted not to trap */ -/* > 12 <= ISPEC <= 16: */ +/* > 12 <= ISPEC <= 17: */ /* > xHSEQR or related subroutines, */ /* > see IPARMQ for detailed explanation */ /* > \endverbatim */ @@ -652,9 +658,7 @@ f"> */ /* > \author Univ. of Colorado Denver */ /* > \author NAG Ltd. */ -/* > \date November 2019 */ - -/* > \ingroup OTHERauxiliary */ +/* > \ingroup ilaenv */ /* > \par Further Details: */ /* ===================== */ @@ -685,7 +689,7 @@ integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1, opts_len) { /* System generated locals */ - integer ret_val; + integer ret_val, i__1, i__2, i__3; /* Local variables */ logical twostage; @@ -702,10 +706,9 @@ integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1, integer *, integer *); -/* -- LAPACK auxiliary routine (version 3.9.0) -- */ +/* -- LAPACK auxiliary routine -- */ /* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ /* -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- */ -/* November 2019 */ /* ===================================================================== */ @@ -728,6 +731,7 @@ integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1, case 14: goto L160; case 15: goto L160; case 16: goto L160; + case 17: goto L160; } /* Invalid value for ISPEC */ @@ -908,6 +912,12 @@ L50: } else { nb = 64; } + } else if (s_cmp(subnam + 3, "QP3RK", (ftnlen)4, (ftnlen)5) == 0) { + if (sname) { + nb = 32; + } else { + nb = 32; + } } } else if (s_cmp(c2, "PO", (ftnlen)2, (ftnlen)2) == 0) { if (s_cmp(c3, "TRF", (ftnlen)3, (ftnlen)3) == 0) { @@ -1034,6 +1044,21 @@ L50: } else { nb = 64; } + } else if (s_cmp(c3, "SYL", (ftnlen)3, (ftnlen)3) == 0) { +/* The upper bound is to prevent overly aggressive scaling. */ + if (sname) { +/* Computing MIN */ +/* Computing MAX */ + i__2 = 48, i__3 = (f2cmin(*n1,*n2) << 4) / 100; + i__1 = f2cmax(i__2,i__3); + nb = f2cmin(i__1,240); + } else { +/* Computing MIN */ +/* Computing MAX */ + i__2 = 24, i__3 = (f2cmin(*n1,*n2) << 3) / 100; + i__1 = f2cmax(i__2,i__3); + nb = f2cmin(i__1,80); + } } } else if (s_cmp(c2, "LA", (ftnlen)2, (ftnlen)2) == 0) { if (s_cmp(c3, "UUM", (ftnlen)3, (ftnlen)3) == 0) { @@ -1042,6 +1067,12 @@ L50: } else { nb = 64; } + } else if (s_cmp(c3, "TRS", (ftnlen)3, (ftnlen)3) == 0) { + if (sname) { + nb = 32; + } else { + nb = 32; + } } } else if (sname && s_cmp(c2, "ST", (ftnlen)2, (ftnlen)2) == 0) { if (s_cmp(c3, "EBZ", (ftnlen)3, (ftnlen)3) == 0) { @@ -1093,6 +1124,12 @@ L60: } else { nbmin = 2; } + } else if (s_cmp(subnam + 3, "QP3RK", (ftnlen)4, (ftnlen)5) == 0) { + if (sname) { + nbmin = 2; + } else { + nbmin = 2; + } } } else if (s_cmp(c2, "SY", (ftnlen)2, (ftnlen)2) == 0) { if (s_cmp(c3, "TRF", (ftnlen)3, (ftnlen)3) == 0) { @@ -1184,6 +1221,12 @@ L70: } else { nx = 128; } + } else if (s_cmp(subnam + 3, "QP3RK", (ftnlen)4, (ftnlen)5) == 0) { + if (sname) { + nx = 128; + } else { + nx = 128; + } } } else if (s_cmp(c2, "SY", (ftnlen)2, (ftnlen)2) == 0) { if (sname && s_cmp(c3, "TRD", (ftnlen)3, (ftnlen)3) == 0) { @@ -1270,29 +1313,29 @@ L130: L140: -/* ISPEC = 10: ieee NaN arithmetic can be trusted not to trap */ +/* ISPEC = 10: ieee and infinity NaN arithmetic can be trusted not to trap */ /* ILAENV = 0 */ ret_val = 1; if (ret_val == 1) { - ret_val = ieeeck_(&c__1, &c_b174, &c_b175); + ret_val = ieeeck_(&c__1, &c_b179, &c_b180); } return ret_val; L150: -/* ISPEC = 11: infinity arithmetic can be trusted not to trap */ +/* ISPEC = 11: ieee infinity arithmetic can be trusted not to trap */ /* ILAENV = 0 */ ret_val = 1; if (ret_val == 1) { - ret_val = ieeeck_(&c__0, &c_b174, &c_b175); + ret_val = ieeeck_(&c__0, &c_b179, &c_b180); } return ret_val; L160: -/* 12 <= ISPEC <= 16: xHSEQR or related subroutines. */ +/* 12 <= ISPEC <= 17: xHSEQR or related subroutines. */ ret_val = iparmq_(ispec, name__, opts, n1, n2, n3, n4) ; diff --git a/lapack-netlib/SRC/ilaenv.f b/lapack-netlib/SRC/ilaenv.f index a639e0375..e74a2b35e 100644 --- a/lapack-netlib/SRC/ilaenv.f +++ b/lapack-netlib/SRC/ilaenv.f @@ -132,7 +132,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
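*
*> \par Example:
*> \verbatim
*> A typical block-size query (the form used by DGETRF for an M-by-N
*> problem; unused dimensions are passed as -1):
*>
*>    NB = ILAENV( 1, 'DGETRF', ' ', M, N, -1, -1 )
*> \endverbatim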
* -*> \ingroup OTHERauxiliary +*> \ingroup ilaenv * *> \par Further Details: * ===================== @@ -355,6 +355,12 @@ ELSE NB = 64 END IF + ELSE IF( SUBNAM( 4: 7 ).EQ.'QP3RK' ) THEN + IF( SNAME ) THEN + NB = 32 + ELSE + NB = 32 + END IF END IF ELSE IF( C2.EQ.'PO' ) THEN IF( C3.EQ.'TRF' ) THEN @@ -541,7 +547,14 @@ ELSE NBMIN = 2 END IF + ELSE IF( SUBNAM( 4: 7 ).EQ.'QP3RK' ) THEN + IF( SNAME ) THEN + NBMIN = 2 + ELSE + NBMIN = 2 + END IF END IF + ELSE IF( C2.EQ.'SY' ) THEN IF( C3.EQ.'TRF' ) THEN IF( SNAME ) THEN @@ -618,6 +631,12 @@ ELSE NX = 128 END IF + ELSE IF( SUBNAM( 4: 7 ).EQ.'QP3RK' ) THEN + IF( SNAME ) THEN + NX = 128 + ELSE + NX = 128 + END IF END IF ELSE IF( C2.EQ.'SY' ) THEN IF( SNAME .AND. C3.EQ.'TRD' ) THEN diff --git a/lapack-netlib/SRC/sbbcsd.f b/lapack-netlib/SRC/sbbcsd.f index d5720cb33..2a619cb71 100644 --- a/lapack-netlib/SRC/sbbcsd.f +++ b/lapack-netlib/SRC/sbbcsd.f @@ -805,7 +805,7 @@ CALL SLARTGP( B22BULGE, B22E(I-1), WORK(IU2SN+I-1), $ WORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL SLARTGS( B21E(I), B21E(I+1), NU, WORK(IU2CS+I-1), + CALL SLARTGS( B21E(I), B21D(I+1), NU, WORK(IU2CS+I-1), $ WORK(IU2SN+I-1) ) ELSE CALL SLARTGS( B22D(I), B22E(I), MU, WORK(IU2CS+I-1), diff --git a/lapack-netlib/SRC/sgebrd.f b/lapack-netlib/SRC/sgebrd.f index 2d0c6d651..b33ad0b1f 100644 --- a/lapack-netlib/SRC/sgebrd.f +++ b/lapack-netlib/SRC/sgebrd.f @@ -122,7 +122,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -223,8 +224,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. EXTERNAL SGEBD2, SGEMM, SLABRD, XERBLA @@ -242,9 +243,16 @@ * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + ENDIF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -252,7 +260,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -264,7 +272,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -283,7 +290,7 @@ * Determine when to switch from blocked to unblocked code. 
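* (NX is the crossover dimension: once the trailing submatrix is no
* larger than NX, the unblocked SGEBD2 path below is used instead.)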
* IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using @@ -342,7 +349,8 @@ * CALL SGEBD2( M-I+1, N-I+1, A( I, I ), LDA, D( I ), E( I ), $ TAUQ( I ), TAUP( I ), WORK, IINFO ) - WORK( 1 ) = SROUNDUP_LWORK(WS) +* + WORK( 1 ) = SROUNDUP_LWORK( WS ) RETURN * * End of SGEBRD diff --git a/lapack-netlib/SRC/sgedmd.f90 b/lapack-netlib/SRC/sgedmd.f90 index 49cb11527..4860e8898 100644 --- a/lapack-netlib/SRC/sgedmd.f90 +++ b/lapack-netlib/SRC/sgedmd.f90 @@ -1,423 +1,573 @@ - SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & - M, N, X, LDX, Y, LDY, NRNK, TOL, & - K, REIG, IMEIG, Z, LDZ, RES, & - B, LDB, W, LDW, S, LDS, & - WORK, LWORK, IWORK, LIWORK, INFO ) -! March 2023 +!> \brief \b SGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== +! +! SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, REIG, IMEIG, Z, LDZ, RES, & +! B, LDB, W, LDW, S, LDS, & +! WORK, LWORK, IWORK, LIWORK, INFO ) !..... - USE iso_fortran_env - IMPLICIT NONE - INTEGER, PARAMETER :: WP = real32 +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real32 !..... ! Scalar arguments - CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF - INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & - NRNK, LDZ, LDB, LDW, LDS, & - LWORK, LIWORK - INTEGER, INTENT(OUT) :: K, INFO - REAL(KIND=WP), INTENT(IN) :: TOL +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LWORK, LIWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL ! Array arguments - REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) - REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & - W(LDW,*), S(LDS,*) - REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & - RES(*) - REAL(KIND=WP), INTENT(OUT) :: WORK(*) - INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! SGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, SGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, SGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. +! REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! 
W(LDW,*), S(LDS,*) +! REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & +! RES(*) +! REAL(KIND=WP), INTENT(OUT) :: WORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) ! +!............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> SGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, SGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, SGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim !...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. 
+!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim !...................................................................... +!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Distribution Statement A: +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim +!============================================================ ! Arguments ! ========= -! JOBS (input) CHARACTER*1 -! Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim !..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 'N' :: The eigenvectors are not computed. +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. 
+!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim !..... -! JOBR (input) CHARACTER*1 -! Determines whether to compute the residuals. -! 'R' :: The residuals for the computed eigenpairs will be -! computed and stored in the array RES. -! See the description of RES. -! For this option to be legal, JOBZ must be 'V'. -! 'N' :: The residuals are not computed. +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. +!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim !..... -! JOBF (input) CHARACTER*1 -! Specifies whether to store information needed for post- -! processing (e.g. computing refined Ritz vectors) -! 'R' :: The matrix needed for the refinement of the Ritz -! vectors is computed and stored in the array B. -! See the description of B. -! 'E' :: The unscaled eigenvectors of the Exact DMD are -! computed and returned in the array B. See the -! description of B. -! 'N' :: No eigenvector refinement data is computed. +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim !..... -! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } -! Allows for a selection of the SVD algorithm from the -! LAPACK library. -! 1 :: SGESVD (the QR SVD algorithm) -! 2 :: SGESDD (the Divide and Conquer algorithm; if enough -! workspace available, this is the fastest option) -! 3 :: SGESVDQ (the preconditioned QR SVD ; this and 4 -! are the most accurate options) -! 4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 -! are the most accurate options) -! For the four methods above, a significant difference in -! the accuracy of small singular values is possible if -! the snapshots vary in norm so that X is severely -! ill-conditioned. If small (smaller than EPS*||X||) -! singular values are of interest and JOBS=='N', then -! the options (3, 4) give the most accurate results, where -! the option 4 is slightly better and with stronger -! theoretical background. -! If JOBS=='S', i.e. the columns of X will be normalized, -! then all methods give nearly equally accurate results. +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. 
+!> 1 :: SGESVD (the QR SVD algorithm) +!> 2 :: SGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: SGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim !..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim !..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim !..... -! X (input/output) REAL(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. +!> \param[in,out] X +!> \verbatim +!> X (input/output) REAL(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. +!> \endverbatim !..... -! LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. +!> \param[in] LDX +!> \verbatim +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim !..... -! Y (input/workspace/output) REAL(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) REAL(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim !..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. 
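[Editor's note] To make the X/Y conventions above concrete: the snapshot pair comes from a trajectory x_1, ..., x_{N+1} of a system x_{k+1} = A*x_k with A not directly accessible, taking X = [x_1 ... x_N] and Y = [x_2 ... x_{N+1}], so Y = A*X holds column by column. A hedged C sketch of the packing, assuming column-major storage with leading dimensions exactly as in the LDX/LDY arguments documented here (traj, ldt and build_snapshots are illustrative names, not LAPACK identifiers):

    #include <stddef.h>

    /* traj is m-by-(n+1), column-major with leading dimension ldt >= m;
       its k-th column holds the k-th trajectory state. */
    static void build_snapshots(int m, int n, const float *traj, int ldt,
                                float *x, int ldx, float *y, int ldy)
    {
        for (int j = 0; j < n; ++j)
            for (int i = 0; i < m; ++i) {
                x[i + (size_t)j * ldx] = traj[i + (size_t)j * ldt];        /* j-th state     */
                y[i + (size_t)j * ldy] = traj[i + (size_t)(j + 1) * ldt];  /* its successor  */
            }
    }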
+!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim !..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1) -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1) +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim !..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim !..... -! K (output) INTEGER, 0 <= K <= N -! The dimension of the POD basis for the data snapshot -! matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim !..... -! REIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of REIG contain -! the real parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! See the descriptions of K, IMEIG, and Z. +!> \param[out] REIG +!> \verbatim +!> REIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of REIG contain +!> the real parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> See the descriptions of K, IMEIG, and Z. +!> \endverbatim !..... -! 
IMEIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of IMEIG contain -! the imaginary parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! The eigenvalues are determined as follows: -! If IMEIG(i) == 0, then the corresponding eigenvalue is -! real, LAMBDA(i) = REIG(i). -! If IMEIG(i)>0, then the corresponding complex -! conjugate pair of eigenvalues reads -! LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) -! LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) -! That is, complex conjugate pairs have consecutive -! indices (i,i+1), with the positive imaginary part -! listed first. -! See the descriptions of K, REIG, and Z. +!> \param[out] IMEIG +!> \verbatim +!> IMEIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of IMEIG contain +!> the imaginary parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> The eigenvalues are determined as follows: +!> If IMEIG(i) == 0, then the corresponding eigenvalue is +!> real, LAMBDA(i) = REIG(i). +!> If IMEIG(i)>0, then the corresponding complex +!> conjugate pair of eigenvalues reads +!> LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) +!> LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) +!> That is, complex conjugate pairs have consecutive +!> indices (i,i+1), with the positive imaginary part +!> listed first. +!> See the descriptions of K, REIG, and Z. +!> \endverbatim !..... -! Z (workspace/output) REAL(KIND=WP) M-by-N array -! If JOBZ =='V' then -! Z contains real Ritz vectors as follows: -! If IMEIG(i)=0, then Z(:,i) is an eigenvector of -! the i-th Ritz value; ||Z(:,i)||_2=1. -! If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then -! [Z(:,i) Z(:,i+1)] span an invariant subspace and -! the Ritz values extracted from this subspace are -! REIG(i) + sqrt(-1)*IMEIG(i) and -! REIG(i) - sqrt(-1)*IMEIG(i). -! The corresponding eigenvectors are -! Z(:,i) + sqrt(-1)*Z(:,i+1) and -! Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. -! || Z(:,i:i+1)||_F = 1. -! If JOBZ == 'F', then the above descriptions hold for -! the columns of X(:,1:K)*W(1:K,1:K), where the columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. The columns of W(1:K,1:K) -! are similarly structured: If IMEIG(i) == 0 then -! X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 -! then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and -! X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) -! are the eigenvectors of LAMBDA(i), LAMBDA(i+1). -! See the descriptions of REIG, IMEIG, X and W. +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) REAL(KIND=WP) M-by-N array +!> If JOBZ =='V' then +!> Z contains real Ritz vectors as follows: +!> If IMEIG(i)=0, then Z(:,i) is an eigenvector of +!> the i-th Ritz value; ||Z(:,i)||_2=1. +!> If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then +!> [Z(:,i) Z(:,i+1)] span an invariant subspace and +!> the Ritz values extracted from this subspace are +!> REIG(i) + sqrt(-1)*IMEIG(i) and +!> REIG(i) - sqrt(-1)*IMEIG(i). +!> The corresponding eigenvectors are +!> Z(:,i) + sqrt(-1)*Z(:,i+1) and +!> Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. +!> || Z(:,i:i+1)||_F = 1. +!> If JOBZ == 'F', then the above descriptions hold for +!> the columns of X(:,1:K)*W(1:K,1:K), where the columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. 
The columns of W(1:K,1:K) +!> are similarly structured: If IMEIG(i) == 0 then +!> X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 +!> then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and +!> X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) +!> are the eigenvectors of LAMBDA(i), LAMBDA(i+1). +!> See the descriptions of REIG, IMEIG, X and W. +!> \endverbatim !..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim !..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs. -! If LAMBDA(i) is real, then -! RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. -! If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair -! then -! RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F -! where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] -! [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. -! It holds that -! RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 -! RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 -! where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) -! ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) -! See the description of REIG, IMEIG and Z. +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs. +!> If LAMBDA(i) is real, then +!> RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. +!> If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair +!> then +!> RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F +!> where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] +!> [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. +!> It holds that +!> RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 +!> RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 +!> where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) +!> ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) +!> See the description of REIG, IMEIG and Z. +!> \endverbatim !..... -! B (output) REAL(KIND=WP) M-by-N array. -! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1;K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. +!> \param[out] B +!> \verbatim +!> B (output) REAL(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1;K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim !..... -! LDB (input) INTEGER, LDB >= M -! The leading dimension of the array B. +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim !..... -! W (workspace/output) REAL(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient (real and -! imaginary parts for each complex conjugate pair of the -! eigenvalues). The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! 
W is also used as a workspace to temporarily store the -! left singular vectors of X. +!> \param[out] W +!> \verbatim +!> W (workspace/output) REAL(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient (real and +!> imaginary parts for each complex conjugate pair of the +!> eigenvalues). The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> left singular vectors of X. +!> \endverbatim !..... -! LDW (input) INTEGER, LDW >= N -! The leading dimension of the array W. +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim !..... -! S (workspace/output) REAL(KIND=WP) N-by-N array -! The array S(1:K,1:K) is used for the matrix Rayleigh -! quotient. This content is overwritten during -! the eigenvalue decomposition by SGEEV. -! See the description of K. +!> \param[out] S +!> \verbatim +!> S (workspace/output) REAL(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by SGEEV. +!> See the description of K. +!> \endverbatim !..... -! LDS (input) INTEGER, LDS >= N -! The leading dimension of the array S. +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim !..... -! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array -! On exit, WORK(1:N) contains the singular values of -! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). -! If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain -! scaling factor WORK(N+2)/WORK(N+1) used to scale X -! and Y to avoid overflow in the SVD of X. -! This may be of interest if the scaling option is off -! and as many as possible smallest eigenvalues are -! desired to the highest feasible accuracy. -! If the call to SGEDMD is only workspace query, then -! WORK(1) contains the minimal workspace length and -! WORK(2) is the optimal workspace length. Hence, the -! length of work is at least 2. -! See the description of LWORK. +!> \param[out] WORK +!> \verbatim +!> WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +!> On exit, WORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain +!> scaling factor WORK(N+2)/WORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. +!> If the call to SGEDMD is only workspace query, then +!> WORK(1) contains the minimal workspace length and +!> WORK(2) is the optimal workspace length. Hence, the +!> length of work is at least 2. +!> See the description of LWORK. +!> \endverbatim !..... -! LWORK (input) INTEGER -! The minimal length of the workspace vector WORK. -! LWORK is calculated as follows: -! If WHTSVD == 1 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). -! If JOBZ == 'N' then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). -! Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal -! workspace length of SGESVD. -! If WHTSVD == 2 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) -! If JOBZ == 'N', then -! 
LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the -! minimal workspace length of SGESDD. -! If WHTSVD == 3 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = N+M+MAX(3*N+1, -! MAX(1,3*N+M,5*N),MAX(1,N)) -! is the minimal workspace length of SGESVDQ. -! If WHTSVD == 4 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the -! minimal workspace length of SGEJSV. -! The above expressions are not simplified in order to -! make the usage of WORK more transparent, and for -! easier checking. In any case, LWORK >= 2. -! If on entry LWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LWORK +!> \verbatim +!> LWORK (input) INTEGER +!> The minimal length of the workspace vector WORK. +!> LWORK is calculated as follows: +!> If WHTSVD == 1 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). +!> If JOBZ == 'N' then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). +!> Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal +!> workspace length of SGESVD. +!> If WHTSVD == 2 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the +!> minimal workspace length of SGESDD. +!> If WHTSVD == 3 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = N+M+MAX(3*N+1, +!> MAX(1,3*N+M,5*N),MAX(1,N)) +!> is the minimal workspace length of SGESVDQ. +!> If WHTSVD == 4 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the +!> minimal workspace length of SGEJSV. +!> The above expressions are not simplified in order to +!> make the usage of WORK more transparent, and for +!> easier checking. In any case, LWORK >= 2. +!> If on entry LWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim !..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! 
If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! !............................................................. !............................................................. + SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, REIG, IMEIG, Z, LDZ, RES, & + B, LDB, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real32 +! +! Scalar arguments +! ~~~~~~~~~~~~~~~~ + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LWORK, LIWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! +! Array arguments +! ~~~~~~~~~~~~~~~ + REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +! ! Parameters ! 
~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP - +! ! Local scalars ! ~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -431,11 +581,11 @@ WNTEX, WNTREF, WNTRES, WNTVEC CHARACTER :: JOBZL, T_OR_N CHARACTER :: JSVOPT - +! ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2) - +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) SLANGE, SLAMCH, SNRM2 @@ -443,13 +593,13 @@ INTEGER ISAMAX LOGICAL SISNAN, LSAME EXTERNAL SISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~~~~ EXTERNAL SAXPY, SGEMM, SSCAL EXTERNAL SGEEV, SGEJSV, SGESDD, SGESVD, SGESVDQ, & SLACPY, SLASCL, SLASSQ, XERBLA - +! ! Intrinsic functions ! ~~~~~~~~~~~~~~~~~~~ INTRINSIC INT, FLOAT, MAX, SQRT @@ -632,7 +782,8 @@ K = 0 DO i = 1, N !WORK(i) = DNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL SLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 @@ -705,7 +856,8 @@ ! carefully computed using SLASSQ. DO i = 1, N !WORK(i) = DNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL SLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 diff --git a/lapack-netlib/SRC/sgehrd.f b/lapack-netlib/SRC/sgehrd.f index 47733d947..cfa17e156 100644 --- a/lapack-netlib/SRC/sgehrd.f +++ b/lapack-netlib/SRC/sgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -173,7 +173,7 @@ INTEGER IHI, ILO, INFO, LDA, LWORK, N * .. * .. Array Arguments .. - REAL A( LDA, * ), TAU( * ), WORK( * ) + REAL A( LDA, * ), TAU( * ), WORK( * ) * .. * * ===================================================================== @@ -182,7 +182,7 @@ INTEGER NBMAX, LDT, TSIZE PARAMETER ( NBMAX = 64, LDT = NBMAX+1, $ TSIZE = LDT*NBMAX ) - REAL ZERO, ONE + REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, $ ONE = 1.0E+0 ) * .. @@ -190,7 +190,7 @@ LOGICAL LQUERY INTEGER I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB, $ NBMIN, NH, NX - REAL EI + REAL EI * .. * .. External Subroutines .. 
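[Editor's note] The SGEHRD change starting above, like the SGEBRD/SGELQF/SGEQLF hunks nearby, tightens one convention: a workspace query must report a size of at least 1 even for empty problems (here, IHI-ILO+1 <= 1). A hedged C usage sketch of the two-call LWORK=-1 pattern these fixes serve, assuming the conventional underscore-suffixed Fortran binding with 32-bit integers; error handling kept minimal:

    #include <stdlib.h>

    extern void sgehrd_(const int *n, const int *ilo, const int *ihi,
                        float *a, const int *lda, float *tau,
                        float *work, const int *lwork, int *info);

    /* Reduce A (n-by-n, column-major) to upper Hessenberg form. */
    static int hessenberg(int n, float *a, int lda, float *tau)
    {
        int ilo = 1, ihi = n, info = 0, lwork = -1;
        float query = 0.0f;
        /* 1st call: LWORK = -1 is a workspace query; the size comes back in WORK(1). */
        sgehrd_(&n, &ilo, &ihi, a, &lda, tau, &query, &lwork, &info);
        if (info != 0) return info;
        lwork = (int)query;   /* >= 1 even when IHI-ILO+1 <= 1, per the fix above */
        float *work = malloc((size_t)lwork * sizeof *work);
        if (work == NULL) return -1;
        /* 2nd call: the actual reduction. */
        sgehrd_(&n, &ilo, &ihi, a, &lda, tau, work, &lwork, &info);
        free(work);
        return info;
    }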
EXTERNAL SAXPY, SGEHD2, SGEMM, SLAHR2, SLARFB, STRMM, @@ -222,13 +222,19 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'SGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'SGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + ENDIF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -249,7 +255,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -269,7 +274,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of @@ -345,7 +350,8 @@ * Use unblocked code to reduce the rest of the matrix * CALL SGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sgelq.f b/lapack-netlib/SRC/sgelq.f index 74c7cc267..75f02675d 100644 --- a/lapack-netlib/SRC/sgelq.f +++ b/lapack-netlib/SRC/sgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -295,9 +295,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -322,7 +322,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) RETURN * * End of SGELQ diff --git a/lapack-netlib/SRC/sgelqf.f b/lapack-netlib/SRC/sgelqf.f index 1ceec4742..3b3913d84 100644 --- a/lapack-netlib/SRC/sgelqf.f +++ b/lapack-netlib/SRC/sgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -175,9 +176,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'SGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -185,19 +185,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. 
LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -267,7 +273,7 @@ $ CALL SGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of SGELQF diff --git a/lapack-netlib/SRC/sgemlq.f b/lapack-netlib/SRC/sgemlq.f index 83536825c..7e4d9bf65 100644 --- a/lapack-netlib/SRC/sgemlq.f +++ b/lapack-netlib/SRC/sgemlq.f @@ -110,13 +110,14 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this *> value as WORK(1), and no error message related to WORK @@ -187,7 +188,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -207,7 +208,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -222,6 +223,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -250,12 +258,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = SROUNDUP_LWORK( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -267,7 +275,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -280,7 +288,7 @@ $ MB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/sgemqr.f b/lapack-netlib/SRC/sgemqr.f index 3207f8bfd..19bf467b8 100644 --- a/lapack-netlib/SRC/sgemqr.f +++ b/lapack-netlib/SRC/sgemqr.f @@ -189,12 +189,13 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMQRT, SLAMTSQR, XERBLA @@ -206,7 +207,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -221,6 +222,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. 
( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -249,12 +257,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -266,7 +274,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -279,7 +287,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/sgeqlf.f b/lapack-netlib/SRC/sgeqlf.f index b1266c89e..14942b765 100644 --- a/lapack-netlib/SRC/sgeqlf.f +++ b/lapack-netlib/SRC/sgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is the *> optimal blocksize. *> @@ -189,8 +190,9 @@ END IF WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * diff --git a/lapack-netlib/SRC/sgeqp3rk.c b/lapack-netlib/SRC/sgeqp3rk.c new file mode 100644 index 000000000..fe52901bf --- /dev/null +++ b/lapack-netlib/SRC/sgeqp3rk.c @@ -0,0 +1,1055 @@ +#include <math.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <complex.h> +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char
*iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? 
(__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double 
zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + +/* Check if the matrix A contains +Inf or -Inf, set INFO parameter */ +/* to the column number, where the first +/-Inf is found plus N, */ +/* and continue the computation. */ + + *info = *n + kp1; + + } + +/* ================================================================== */ + +/* Quick return if possible for the case when the first */ +/* stopping criterion is satisfied, i.e. KMAX = 0. */ + + if (*kmax == 0) { + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.f; + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + tau[j] = 0.f; + } + work[1] = (real) lwkopt; + return 0; + } + +/* ================================================================== */ + + eps = slamch_("Epsilon"); + +/* Adjust ABSTOL */ + + if (*abstol >= 0.f) { + safmin = slamch_("Safe minimum"); +/* Computing MAX */ + r__1 = *abstol, r__2 = safmin * 2.f; + *abstol = f2cmax(r__1,r__2); + } + +/* Adjust RELTOL */ + + if (*reltol >= 0.f) { + *reltol = f2cmax(*reltol,eps); + } + +/* =================================================================== */ + +/* JMAX is the maximum index of the column to be factorized, */ +/* which is also limited by the first stopping criterion KMAX. */ + + jmax = f2cmin(*kmax,minmn); + +/* =================================================================== */ + +/* Quick return if possible for the case when the second or third */ +/* stopping criterion for the whole original matrix is satified, */ +/* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */ +/* (which is ONE <= RELTOL). */ + + if (maxc2nrm <= *abstol || 1.f <= *reltol) { + + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.f; + + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + tau[j] = 0.f; + } + + work[1] = (real) lwkopt; + return 0; + } + +/* ================================================================== */ +/* Factorize columns */ +/* ================================================================== */ + +/* Determine the block size. */ + + nbmin = 2; + nx = 0; + + if (nb > 1 && nb < minmn) { + +/* Determine when to cross over from blocked to unblocked code. */ +/* (for N less than NX, unblocked code should be used). 
*/ + +/* Computing MAX */ + i__1 = 0, i__2 = ilaenv_(&c__3, "SGEQP3RK", " ", m, n, &c_n1, &c_n1, ( + ftnlen)8, (ftnlen)1); + nx = f2cmax(i__1,i__2); + + if (nx < minmn) { + +/* Determine if workspace is large enough for blocked code. */ + + if (*lwork < lwkopt) { + +/* Not enough workspace to use optimal block size that */ +/* is currently stored in NB. */ +/* Reduce NB and determine the minimum value of NB. */ + + nb = (*lwork - (*n << 1)) / (*n + 1); +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "SGEQP3RK", " ", m, n, &c_n1, + &c_n1, (ftnlen)8, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + } + } + } + +/* ================================================================== */ + +/* DONE is the boolean flag to rerpresent the case when the */ +/* factorization completed in the block factorization routine, */ +/* before the end of the block. */ + + done = FALSE_; + +/* J is the column index. */ + + j = 1; + +/* (1) Use blocked code initially. */ + +/* JMAXB is the maximum column index of the block, when the */ +/* blocked code is used, is also limited by the first stopping */ +/* criterion KMAX. */ + +/* Computing MIN */ + i__1 = *kmax, i__2 = minmn - nx; + jmaxb = f2cmin(i__1,i__2); + + if (nb >= nbmin && nb < jmax && jmaxb > 0) { + +/* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */ +/* J is the column index of a column block; */ +/* JB is the column block size to pass to block factorization */ +/* routine in a loop step; */ +/* JBF is the number of columns that were actually factorized */ +/* that was returned by the block factorization routine */ +/* in a loop step, JBF <= JB; */ +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + while(j <= jmaxb) { + +/* Computing MIN */ + i__1 = nb, i__2 = jmaxb - j + 1; + jb = f2cmin(i__1,i__2); + n_sub__ = *n - j + 1; + ioffset = j - 1; + +/* Factorize JB columns among the columns A(J:N). */ + + i__1 = *n + *nrhs - j + 1; + slaqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, + &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, + maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], & + work[*n + j], &work[(*n << 1) + 1], &work[(*n << 1) + jb + + 1], &i__1, &iwork[1], &iinfo); + +/* Set INFO on the first occurence of Inf. */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } + + if (done) { + +/* Either the submatrix is zero before the end of the */ +/* column block, or ABSTOL or RELTOL criterion is */ +/* satisfied before the end of the column block, we can */ +/* return from the routine. Perform the following before */ +/* returning: */ +/* a) Set the number of factorized columns K, */ +/* K = IOFFSET + JBF from the last call of blocked */ +/* routine. */ +/* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */ +/* by the block factorization routine; */ +/* 2) The remaining TAUs are set to ZERO by the */ +/* block factorization routine. */ + + *k = ioffset + jbf; + +/* Set INFO on the first occurrence of NaN, NaN takes */ +/* prcedence over Inf. */ + + if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + +/* Return from the routine. */ + + work[1] = (real) lwkopt; + + return 0; + + } + + j += jbf; + + } + + } + +/* Use unblocked code to factor the last or only block. */ +/* J = JMAX+1 means we factorized the maximum possible number of */ +/* columns, that is in ELSE clause we need to compute */ +/* the MAXC2NORM and RELMAXC2NORM to return after we processed */ +/* the blocks. 
*/ + + if (j <= jmax) { + +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + n_sub__ = *n - j + 1; + ioffset = j - 1; + + i__1 = jmax - j + 1; + slaqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, & + maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, + relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], &work[*n + j], & + work[(*n << 1) + 1], &iinfo); + +/* ABSTOL or RELTOL criterion is satisfied when the number of */ +/* the factorized columns KF is smaller then the number */ +/* of columns JMAX-J+1 supplied to be factorized by the */ +/* unblocked routine, we can return from */ +/* the routine. Perform the following before returning: */ +/* a) Set the number of factorized columns K, */ +/* b) MAXC2NRMK and RELMAXC2NRMK are returned by the */ +/* unblocked factorization routine above. */ + + *k = j - 1 + kf; + +/* Set INFO on the first exception occurence. */ + +/* Set INFO on the first exception occurence of Inf or NaN, */ +/* (NaN takes precedence over Inf). */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } else if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + + } else { + +/* Compute the return values for blocked code. */ + +/* Set the number of factorized columns if the unblocked routine */ +/* was not called. */ + + *k = jmax; + +/* If there exits a residual matrix after the blocked code: */ +/* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */ +/* residual matrix, otherwise set them to ZERO; */ +/* 2) Set TAU(K+1:MINMN) to ZERO. */ + + if (*k < minmn) { + i__1 = *n - *k; + jmaxc2nrm = *k + isamax_(&i__1, &work[*k + 1], &c__1); + *maxc2nrmk = work[jmaxc2nrm]; + if (*k == 0) { + *relmaxc2nrmk = 1.f; + } else { + *relmaxc2nrmk = *maxc2nrmk / maxc2nrm; + } + + i__1 = minmn; + for (j = *k + 1; j <= i__1; ++j) { + tau[j] = 0.f; + } + + } + +/* END IF( J.LE.JMAX ) THEN */ + + } + + work[1] = (real) lwkopt; + + return 0; + +/* End of SGEQP3RK */ + +} /* sgeqp3rk_ */ + diff --git a/lapack-netlib/SRC/sgeqp3rk.f b/lapack-netlib/SRC/sgeqp3rk.f new file mode 100644 index 000000000..d3a335b88 --- /dev/null +++ b/lapack-netlib/SRC/sgeqp3rk.f @@ -0,0 +1,1083 @@ +*> \brief \b SGEQP3RK computes a truncated Householder QR factorization with column pivoting of a real m-by-n matrix A by using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SGEQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, +* $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ WORK, LWORK, IWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, K, KMAX, LDA, LWORK, M, N, NRHS +* REAL ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* REAL A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SGEQP3RK performs two tasks simultaneously: +*> +*> Task 1: The routine computes a truncated (rank K) or full rank +*> Householder QR factorization with column pivoting of a real +*> M-by-N matrix A using Level 3 BLAS. K is the number of columns +*> that were factorized, i.e. factorization rank of the +*> factor R, K <= min(M,N). 
+*>
+*>              A * P(K) = Q(K) * R(K) =
+*>
+*>        = Q(K) * ( R11(K) R12(K) ) = Q(K) * ( R(K)_approx )
+*>                 (   0    R22(K) )          ( 0  R(K)_residual ),
+*>
+*> where:
+*>
+*>   P(K)            is an N-by-N permutation matrix;
+*>   Q(K)            is an M-by-M orthogonal matrix;
+*>   R(K)_approx   = ( R11(K), R12(K) ) is a rank K approximation of
+*>                   the full rank factor R with K-by-K upper-triangular
+*>                   R11(K) and K-by-N rectangular R12(K). The diagonal
+*>                   entries of R11(K) appear in non-increasing order
+*>                   of absolute value, and absolute values of all of
+*>                   them exceed the maximum column 2-norm of R22(K)
+*>                   up to roundoff error.
+*>   R(K)_residual = R22(K) is the residual of a rank K approximation
+*>                   of the full rank factor R. It is
+*>                   an (M-K)-by-(N-K) rectangular matrix;
+*>   0               is an (M-K)-by-K zero matrix.
+*>
+*> Task 2: At the same time, the routine overwrites a real M-by-NRHS
+*>         matrix B with Q(K)**T * B using Level 3 BLAS.
+*>
+*> =====================================================================
+*>
+*> The matrices A and B are stored on input in the array A as
+*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS)
+*> respectively.
+*>
+*>                            N     NRHS
+*>        array_A  =  M  [ mat_A, mat_B ]
+*>
+*> The truncation criteria (i.e. when to stop the factorization)
+*> can be any of the following:
+*>
+*>   1) The input parameter KMAX, the maximum number of columns
+*>      KMAX to factorize, i.e. the factorization rank is limited
+*>      to KMAX. If KMAX >= min(M,N), the criterion is not used.
+*>
+*>   2) The input parameter ABSTOL, the absolute tolerance for
+*>      the maximum column 2-norm of the residual matrix R22(K). This
+*>      means that the factorization stops if this norm is less than
+*>      or equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used.
+*>
+*>   3) The input parameter RELTOL, the tolerance for the maximum
+*>      column 2-norm of the residual matrix R22(K) divided
+*>      by the maximum column 2-norm of the original matrix A, which
+*>      is equal to abs(R(1,1)). This means that the factorization stops
+*>      when the ratio of the maximum column 2-norm of R22(K) to
+*>      the maximum column 2-norm of A is less than or equal to RELTOL.
+*>      If RELTOL < 0.0, the criterion is not used.
+*>
+*>   4) In case both stopping criteria ABSTOL and RELTOL are not used,
+*>      the factorization stops when the residual matrix R22(K) is
+*>      a zero matrix in some factorization step K.
+*>      ( This stopping criterion is implicit. )
+*>
+*> The algorithm stops when any of these conditions is first
+*> satisfied, otherwise the whole matrix A is factorized.
+*>
+*> To factorize the whole matrix A, use the values
+*> KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0.
+*>
+*> The routine returns:
+*>   a) Q(K), R(K)_approx = ( R11(K), R12(K) ),
+*>      R(K)_residual = R22(K), P(K), i.e. the resulting matrices
+*>      of the factorization; P(K) is represented by JPIV,
+*>      ( if K = min(M,N), R(K)_approx is the full factor R,
+*>      and there is no residual matrix R(K)_residual);
+*>   b) K, the number of columns that were factorized,
+*>      i.e. factorization rank;
+*>   c) MAXC2NRMK, the maximum column 2-norm of the residual
+*>      matrix R(K)_residual = R22(K),
+*>      ( if K = min(M,N), MAXC2NRMK = 0.0 );
+*>   d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum
+*>      column 2-norm of the original matrix A, which is equal
+*>      to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 );
+*>   e) Q(K)**T * B, the matrix B with the orthogonal
+*>      transformation Q(K)**T applied on the left.
+*>
+*> The N-by-N permutation matrix P(K) is stored in a compact form in
+*> the integer array JPIV.
For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The M-by-M orthogonal matrix Q is represented as a product +*> of elementary Householder reflectors +*> +*> Q(K) = H(1) * H(2) * . . . * H(K), +*> +*> where K is the number of columns that were factorized. +*> +*> Each H(j) has the form +*> +*> H(j) = I - tau * v * v**T, +*> +*> where 1 <= j <= K and +*> I is an M-by-M identity matrix, +*> tau is a real scalar, +*> v is a real vector with v(1:j-1) = 0 and v(j) = 1. +*> +*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j). +*> +*> See the Further Details section for more information. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e. the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M,N), then this stopping criterion +*> is not used, the routine factorizes columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B are not modified, and +*> the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL +*> +*> The second factorization stopping criterion, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix R22(K). +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix R22(K) +*> is less than or equal to ABSTOL. Let SAFMIN = SLAMCH('S'). +*> +*> a) If ABSTOL is NaN, then no computation is performed +*> and an error message ( INFO = -5 ) is issued +*> by XERBLA. +*> +*> b) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN +*> is used. This includes the case ABSTOL = -0.0. +*> +*> d) If 2*SAFMIN <= ABSTOL then the input value +*> of ABSTOL is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If ABSTOL chosen above is >= MAXC2NRM, then this +*> stopping criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. The routine +*> returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case ABSTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is REAL +*> +*> The third factorization stopping criterion, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio +*> abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of +*> the residual matrix R22(K) to the maximum column 2-norm of +*> the original matrix A. 
The algorithm converges (stops the
+*>          factorization) when abs(R(K+1,K+1))/abs(R(1,1)) is less
+*>          than or equal to RELTOL. Let EPS = SLAMCH('E').
+*>
+*>          a) If RELTOL is NaN, then no computation is performed
+*>             and an error message ( INFO = -6 ) is issued
+*>             by XERBLA.
+*>
+*>          b) If RELTOL < 0.0, then this stopping criterion is not
+*>             used, the routine factorizes columns depending
+*>             on KMAX and ABSTOL.
+*>             This includes the case RELTOL = -Inf.
+*>
+*>          c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used.
+*>             This includes the case RELTOL = -0.0.
+*>
+*>          d) If EPS <= RELTOL then the input value of RELTOL
+*>             is used.
+*>
+*>          Let MAXC2NRM be the maximum column 2-norm of the
+*>          whole original matrix A.
+*>          If RELTOL chosen above is >= 1.0, then this stopping
+*>          criterion is satisfied on input and the routine exits
+*>          immediately after MAXC2NRM is computed.
+*>          The routine returns MAXC2NRM in MAXC2NRMK,
+*>          and 1.0 in RELMAXC2NRMK.
+*>          This includes the case RELTOL = +Inf. This means that the
+*>          factorization is not performed, the matrices A and B are
+*>          not modified, and the matrix A is itself the residual.
+*>
+*>          NOTE: We recommend that RELTOL satisfy
+*>                min( max(M,N)*EPS, sqrt(EPS) ) <= RELTOL
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N+NRHS)
+*>
+*>          On entry:
+*>
+*>          a) The subarray A(1:M,1:N) contains the M-by-N matrix A.
+*>          b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS
+*>             matrix B.
+*>
+*>                            N     NRHS
+*>          array_A = M [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>
+*>          a) The subarray A(1:M,1:N) contains parts of the factors
+*>             of the matrix A:
+*>
+*>            1) If K = 0, A(1:M,1:N) contains the original matrix A.
+*>            2) If K > 0, A(1:M,1:N) contains parts of the
+*>               factors:
+*>
+*>              1. The elements below the diagonal of the subarray
+*>                 A(1:M,1:K) together with TAU(1:K) represent the
+*>                 orthogonal matrix Q(K) as a product of K Householder
+*>                 elementary reflectors.
+*>
+*>              2. The elements on and above the diagonal of
+*>                 the subarray A(1:K,1:N) contain the K-by-N
+*>                 upper-trapezoidal matrix
+*>                 R(K)_approx = ( R11(K), R12(K) ).
+*>                 NOTE: If K=min(M,N), i.e. full rank factorization,
+*>                       then R_approx(K) is the full factor R which
+*>                       is upper-trapezoidal. If, in addition, M>=N,
+*>                       then R is upper-triangular.
+*>
+*>              3. The subarray A(K+1:M,K+1:N) contains the
+*>                 (M-K)-by-(N-K) rectangular matrix
+*>                 R(K)_residual = R22(K).
+*>
+*>          b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains
+*>             the M-by-NRHS product Q(K)**T * B.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*>          This is the leading dimension for both matrices, A and B.
+*> \endverbatim
+*>
+*> \param[out] K
+*> \verbatim
+*>          K is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= K <= min(M,KMAX,N).
+*>
+*>          K also represents the number of non-zero Householder
+*>          vectors.
+*>
+*>          NOTE: If K = 0, a) the arrays A and B are not modified;
+*>                          b) the array TAU(1:min(M,N)) is set to ZERO,
+*>                             if the matrix A does not contain NaN,
+*>                             otherwise the elements TAU(1:min(M,N))
+*>                             are undefined;
+*>                          c) the elements of the array JPIV are set
+*>                             as follows: for j = 1:N, JPIV(j) = j.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is REAL
+*>          The maximum column 2-norm of the residual matrix R22(K),
+*>          when the factorization stopped at rank K. MAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then MAXC2NRMK equals the maximum column 2-norm
+*>             of the original matrix A.
+*>
+*>          b) If 0 < K < min(M,N), then MAXC2NRMK is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then MAXC2NRMK = 0.0.
+*>
+*>          NOTE: MAXC2NRMK in the factorization step K would equal
+*>                R(K+1,K+1) in the next factorization step K+1.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is REAL
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix R22(K) (when the factorization
+*>          stopped at rank K) to the maximum column 2-norm of the
+*>          whole original matrix A. RELMAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then RELMAXC2NRMK = 1.0.
+*>
+*>          b) If 0 < K < min(M,N), then
+*>             RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then RELMAXC2NRMK = 0.0.
+*>
+*>          NOTE: RELMAXC2NRMK in the factorization step K would equal
+*>                abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization
+*>                step K+1.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices. For 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*>
+*>          The elements of the array JPIV(1:N) are always set
+*>          by the routine; for example, even when no columns
+*>          were factorized, i.e. when K = 0, the elements are
+*>          set as JPIV(j) = j for j = 1:N.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is REAL array, dimension (min(M,N))
+*>          The scalar factors of the elementary reflectors.
+*>
+*>          If 0 < K <= min(M,N), only the elements TAU(1:K) of
+*>          the array TAU are modified by the factorization.
+*>          After the factorization is computed, if no NaN was found
+*>          during the factorization, the remaining elements
+*>          TAU(K+1:min(M,N)) are set to zero, otherwise the
+*>          elements TAU(K+1:min(M,N)) are not set and therefore
+*>          undefined.
+*>          ( If K = 0, all elements of TAU are set to zero, if
+*>          the matrix A does not contain NaN. )
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and
+*>          LWORK >= (3*N+NRHS-1), otherwise.
+*>          For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )),
+*>          where NB is the optimal block size for SGEQP3RK returned
+*>          by ILAENV. Minimal block size MINNB=2.
+*>
+*>          NOTE: The decision whether to use unblocked BLAS 2
+*>          or blocked BLAS 3 code is based not only on the dimension
+*>          LWORK of the available workspace WORK, but also on the
+*>          matrix A dimension N via the crossover point NX returned
+*>          by ILAENV. (For N less than NX, unblocked code should be
+*>          used.)
+*>
+*>          If LWORK = -1, then a workspace query is assumed;
+*>          the routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
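+*>
+*>          A minimal calling sketch of the workspace query
+*>          ( illustrative only; it assumes the caller declares a
+*>          scratch scalar WQUERY(1) and an allocatable array WORK,
+*>          names which are placeholders and not part of this
+*>          interface ):
+*>
+*>             CALL SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+*>            $               A, LDA, K, MAXC2NRMK, RELMAXC2NRMK,
+*>            $               JPIV, TAU, WQUERY, -1, IWORK, INFO )
+*>             LWORK = INT( WQUERY( 1 ) )
+*>             ALLOCATE( WORK( LWORK ) )
+*>             CALL SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+*>            $               A, LDA, K, MAXC2NRMK, RELMAXC2NRMK,
+*>            $               JPIV, TAU, WORK, LWORK, IWORK, INFO )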
+*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix in the blocked step auxiliary subroutine SLAQP3RK ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) INFO < 0: if INFO = -i, the i-th argument had an +*> illegal value. +*> 3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup geqp3rk +* +*> \par Further Details: +* ===================== +* +*> \verbatim +*> SGEQP3RK is based on the same BLAS3 Householder QR factorization +*> algorithm with column pivoting as in SGEQP3 routine which uses +*> SLARFG routine to generate Householder reflectors +*> for QR factorization. +*> +*> We can also write: +*> +*> A = A_approx(K) + A_residual(K) +*> +*> The low rank approximation matrix A(K)_approx from +*> the truncated QR factorization of rank K of the matrix A is: +*> +*> A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T +*> ( 0 0 ) +*> +*> = Q(K) * ( R11(K) R12(K) ) * P(K)**T +*> ( 0 0 ) +*> +*> The residual A_residual(K) of the matrix A is: +*> +*> A_residual(K) = Q(K) * ( 0 0 ) * P(K)**T = +*> ( 0 R(K)_residual ) +*> +*> = Q(K) * ( 0 0 ) * P(K)**T +*> ( 0 R22(K) ) +*> +*> The truncated (rank K) factorization guarantees that +*> the maximum column 2-norm of A_residual(K) is less than +*> or equal to MAXC2NRMK up to roundoff error. +*> +*> NOTE: An approximation of the null vectors +*> of A can be easily computed from R11(K) +*> and R12(K): +*> +*> Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) ) +*> ( -I ) +*> +*> \endverbatim +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. 
+*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, + $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ WORK, LWORK, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS + REAL ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + REAL A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER INB, INBMIN, IXOVER + PARAMETER ( INB = 1, INBMIN = 2, IXOVER = 3 ) + REAL ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, DONE + INTEGER IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX, + $ JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB, + $ NBMIN, NX + REAL EPS, HUGEVAL, MAXC2NRM, SAFMIN +* .. +* .. External Subroutines .. + EXTERNAL SLAQP2RK, SLAQP3RK, XERBLA +* .. +* .. External Functions .. + LOGICAL SISNAN + INTEGER ISAMAX, ILAENV + REAL SLAMCH, SNRM2, SROUNDUP_LWORK + EXTERNAL SISNAN, SLAMCH, SNRM2, ISAMAX, ILAENV, + $ SROUNDUP_LWORK +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test input arguments +* ==================== +* + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( KMAX.LT.0 ) THEN + INFO = -4 + ELSE IF( SISNAN( ABSTOL ) ) THEN + INFO = -5 + ELSE IF( SISNAN( RELTOL ) ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -8 + END IF +* +* If the input parameters M, N, NRHS, KMAX, LDA are valid: +* a) Test the input workspace size LWORK for the minimum +* size requirement IWS. +* b) Determine the optimal block size NB and optimal +* workspace size LWKOPT to be returned in WORK(1) +* in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., +* (3) when routine exits. +* Here, IWS is the miminum workspace required for unblocked +* code. +* + IF( INFO.EQ.0 ) THEN + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + IWS = 1 + LWKOPT = 1 + ELSE +* +* Minimal workspace size in case of using only unblocked +* BLAS 2 code in SLAQP2RK. +* 1) SGEQP3RK and SLAQP2RK: 2*N to store full and partial +* column 2-norms. +* 2) SLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in SLARF subroutine inside SLAQP2RK to apply an +* elementary reflector from the left. +* TOTAL_WORK_SIZE = 3*N + NRHS - 1 +* + IWS = 3*N + NRHS - 1 +* +* Assign to NB optimal block size. +* + NB = ILAENV( INB, 'SGEQP3RK', ' ', M, N, -1, -1 ) +* +* A formula for the optimal workspace size in case of using +* both unblocked BLAS 2 in SLAQP2RK and blocked BLAS 3 code +* in SLAQP3RK. 
+* 1) SGEQP3RK, SLAQP2RK, SLAQP3RK: 2*N to store full and +* partial column 2-norms. +* 2) SLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in SLARF subroutine to apply an elementary reflector +* from the left. +* 3) SLAQP3RK: NB*(N+NRHS) to use in the work array F that +* is used to apply a block reflector from +* the left. +* 4) SLAQP3RK: NB to use in the auxilixary array AUX. +* Sizes (2) and ((3) + (4)) should intersect, therefore +* TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. +* + LWKOPT = 2*N + NB*( N+NRHS+1 ) + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* + IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN + INFO = -15 + END IF + END IF +* +* NOTE: The optimal workspace size is returned in WORK(1), if +* the input parameters M, N, NRHS, KMAX, LDA are valid. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGEQP3RK', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible for M=0 or N=0. +* + IF( MINMN.EQ.0 ) THEN + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* +* Initialize column pivot array JPIV. +* + DO J = 1, N + JPIV( J ) = J + END DO +* +* ================================================================== +* +* Initialize storage for partial and exact column 2-norms. +* a) The elements WORK(1:N) are used to store partial column +* 2-norms of the matrix A, and may decrease in each computation +* step; initialize to the values of complete columns 2-norms. +* b) The elements WORK(N+1:2*N) are used to store complete column +* 2-norms of the matrix A, they are not changed during the +* computation; initialize the values of complete columns 2-norms. +* + DO J = 1, N + WORK( J ) = SNRM2( M, A( 1, J ), 1 ) + WORK( N+J ) = WORK( J ) + END DO +* +* ================================================================== +* +* Compute the pivot column index and the maximum column 2-norm +* for the whole original matrix stored in A(1:M,1:N). +* + KP1 = ISAMAX( N, WORK( 1 ), 1 ) + MAXC2NRM = WORK( KP1 ) +* +* ==================================================================. +* + IF( SISNAN( MAXC2NRM ) ) THEN +* +* Check if the matrix A contains NaN, set INFO parameter +* to the column number where the first NaN is found and return +* from the routine. +* + K = 0 + INFO = KP1 +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = MAXC2NRM +* +* Array TAU is not set and contains undefined elements. +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RETURN + END IF +* +* =================================================================== +* + IF( MAXC2NRM.EQ.ZERO ) THEN +* +* Check is the matrix A is a zero matrix, set array TAU and +* return from the routine. +* + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + DO J = 1, MINMN + TAU( J ) = ZERO + END DO +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RETURN +* + END IF +* +* =================================================================== +* + HUGEVAL = SLAMCH( 'Overflow' ) +* + IF( MAXC2NRM.GT.HUGEVAL ) THEN +* +* Check if the matrix A contains +Inf or -Inf, set INFO parameter +* to the column number, where the first +/-Inf is found plus N, +* and continue the computation. +* + INFO = N + KP1 +* + END IF +* +* ================================================================== +* +* Quick return if possible for the case when the first +* stopping criterion is satisfied, i.e. KMAX = 0. 
+*
+      IF( KMAX.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+         DO J = 1, MINMN
+            TAU( J ) = ZERO
+         END DO
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+* ==================================================================
+*
+      EPS = SLAMCH('Epsilon')
+*
+* Adjust ABSTOL
+*
+      IF( ABSTOL.GE.ZERO ) THEN
+         SAFMIN = SLAMCH('Safe minimum')
+         ABSTOL = MAX( ABSTOL, TWO*SAFMIN )
+      END IF
+*
+* Adjust RELTOL
+*
+      IF( RELTOL.GE.ZERO ) THEN
+         RELTOL = MAX( RELTOL, EPS )
+      END IF
+*
+* ===================================================================
+*
+* JMAX is the maximum index of the column to be factorized,
+* which is also limited by the first stopping criterion KMAX.
+*
+      JMAX = MIN( KMAX, MINMN )
+*
+* ===================================================================
+*
+* Quick return if possible for the case when the second or third
+* stopping criterion for the whole original matrix is satisfied,
+* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL
+* (which is ONE <= RELTOL).
+*
+      IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN
+*
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+*
+         DO J = 1, MINMN
+            TAU( J ) = ZERO
+         END DO
+*
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+* ==================================================================
+* Factorize columns
+* ==================================================================
+*
+* Determine the block size.
+*
+      NBMIN = 2
+      NX = 0
+*
+      IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN
+*
+*        Determine when to cross over from blocked to unblocked code.
+*        (For N less than NX, unblocked code should be used.)
+*
+         NX = MAX( 0, ILAENV( IXOVER, 'SGEQP3RK', ' ', M, N, -1, -1 ))
+*
+         IF( NX.LT.MINMN ) THEN
+*
+*           Determine if workspace is large enough for blocked code.
+*
+            IF( LWORK.LT.LWKOPT ) THEN
+*
+*              Not enough workspace to use the optimal block size
+*              that is currently stored in NB.
+*              Reduce NB and determine the minimum value of NB.
+*
+               NB = ( LWORK-2*N ) / ( N+1 )
+               NBMIN = MAX( 2, ILAENV( INBMIN, 'SGEQP3RK', ' ', M, N,
+     $                 -1, -1 ) )
+*
+            END IF
+         END IF
+      END IF
+*
+* ==================================================================
+*
+* DONE is the boolean flag that indicates that the factorization
+* was completed in the block factorization routine, before the
+* end of the block.
+*
+      DONE = .FALSE.
+*
+* J is the column index.
+*
+      J = 1
+*
+* (1) Use blocked code initially.
+*
+* JMAXB is the maximum column index of the block when the blocked
+* code is used; it is also limited by the first stopping
+* criterion KMAX.
+*
+      JMAXB = MIN( KMAX, MINMN - NX )
+*
+      IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN
+*
+* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here:
+* J is the column index of a column block;
+* JB is the column block size to pass to the block factorization
+* routine in a loop step;
+* JBF is the number of columns that were actually factorized,
+* as returned by the block factorization routine
+* in a loop step, JBF <= JB;
+* N_SUB is the number of columns in the submatrix;
+* IOFFSET is the number of rows that should not be factorized.
+*
+         DO WHILE( J.LE.JMAXB )
+*
+            JB = MIN( NB, JMAXB-J+1 )
+            N_SUB = N-J+1
+            IOFFSET = J-1
+*
+*           Factorize JB columns among the columns A(J:N).
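+*           Example: with N = 10, NB = 4 and JMAXB = 10, the loop
+*           visits J = 1, 5, 9 with block sizes JB = 4, 4, 2,
+*           provided each call factorizes its full block, i.e.
+*           JBF = JB in every step.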
+*
+            CALL SLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                     DONE, JBF, MAXC2NRMK, RELMAXC2NRMK,
+     $                     JPIV( J ), TAU( J ),
+     $                     WORK( J ), WORK( N+J ),
+     $                     WORK( 2*N+1 ), WORK( 2*N+JB+1 ),
+     $                     N+NRHS-J+1, IWORK, IINFO )
+*
+*           Set INFO on the first occurrence of Inf.
+*
+            IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+               INFO = 2*IOFFSET + IINFO
+            END IF
+*
+            IF( DONE ) THEN
+*
+*              Either the submatrix is zero before the end of the
+*              column block, or the ABSTOL or RELTOL criterion is
+*              satisfied before the end of the column block; in both
+*              cases we can return from the routine. Perform the
+*              following before returning:
+*                 a) Set the number of factorized columns K,
+*                    K = IOFFSET + JBF from the last call of the
+*                    blocked routine.
+*                 NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned
+*                          by the block factorization routine;
+*                       2) The remaining TAUs are set to ZERO by the
+*                          block factorization routine.
+*
+               K = IOFFSET + JBF
+*
+*              Set INFO on the first occurrence of NaN, NaN takes
+*              precedence over Inf.
+*
+               IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+                  INFO = IOFFSET + IINFO
+               END IF
+*
+*              Return from the routine.
+*
+               WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
+               RETURN
+*
+            END IF
+*
+            J = J + JBF
+*
+         END DO
+*
+      END IF
+*
+* Use unblocked code to factor the last or only block.
+* J = JMAX+1 means we factorized the maximum possible number of
+* columns; that is, in the ELSE clause we need to compute
+* MAXC2NRMK and RELMAXC2NRMK to return after we processed
+* the blocks.
+*
+      IF( J.LE.JMAX ) THEN
+*
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         N_SUB = N-J+1
+         IOFFSET = J-1
+*
+         CALL SLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1,
+     $                  ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                  KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ),
+     $                  TAU( J ), WORK( J ), WORK( N+J ),
+     $                  WORK( 2*N+1 ), IINFO )
+*
+*        The ABSTOL or RELTOL criterion is satisfied when the number
+*        of factorized columns KF is smaller than the number
+*        of columns JMAX-J+1 supplied to be factorized by the
+*        unblocked routine; in that case we can return from
+*        the routine. Perform the following before returning:
+*           a) Set the number of factorized columns K;
+*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the
+*              unblocked factorization routine above.
+*
+         K = J - 1 + KF
+*
+*        Set INFO on the first occurrence of Inf or NaN
+*        (NaN takes precedence over Inf).
+*
+         IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+            INFO = 2*IOFFSET + IINFO
+         ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+            INFO = IOFFSET + IINFO
+         END IF
+*
+      ELSE
+*
+*        Compute the return values for blocked code.
+*
+*        Set the number of factorized columns if the unblocked
+*        routine was not called.
+*
+         K = JMAX
+*
+*        If there exists a residual matrix after the blocked code:
+*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the
+*              residual matrix, otherwise set them to ZERO;
+*           2) Set TAU(K+1:MINMN) to ZERO.
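+*
+*        NOTE: WORK( K+1:N ) still holds the partial column 2-norms
+*        of the residual columns, as maintained by the blocked
+*        factorization routine; this is why the ISAMAX search below
+*        over WORK( K+1:N ) recovers the maximum column 2-norm of
+*        the residual matrix.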
+* + IF( K.LT.MINMN ) THEN + JMAXC2NRM = K + ISAMAX( N-K, WORK( K+1 ), 1 ) + MAXC2NRMK = WORK( JMAXC2NRM ) + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + DO J = K + 1, MINMN + TAU( J ) = ZERO + END DO +* + END IF +* +* END IF( J.LE.JMAX ) THEN +* + END IF +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* + RETURN +* +* End of SGEQP3RK +* + END diff --git a/lapack-netlib/SRC/sgeqr.f b/lapack-netlib/SRC/sgeqr.f index 6f41a92ea..79a515e1c 100644 --- a/lapack-netlib/SRC/sgeqr.f +++ b/lapack-netlib/SRC/sgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE SGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,11 +190,13 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLATSQR, SGEQRT, XERBLA @@ -244,8 +248,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +259,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +274,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +288,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +315,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) * RETURN * diff --git a/lapack-netlib/SRC/sgeqrfp.f b/lapack-netlib/SRC/sgeqrfp.f index d1ee2a828..37747c512 100644 --- a/lapack-netlib/SRC/sgeqrfp.f +++ b/lapack-netlib/SRC/sgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. EXTERNAL SGEQR2P, SLARFB, SLARFT, XERBLA @@ -173,8 +174,9 @@ * .. * .. External Functions .. 
INTEGER ILAENV + EXTERNAL ILAENV REAL SROUNDUP_LWORK - EXTERNAL ILAENV, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -182,8 +184,16 @@ * INFO = 0 NB = ILAENV( 1, 'SGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -191,7 +201,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -211,7 +221,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. @@ -273,7 +283,7 @@ $ CALL SGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of SGEQRFP diff --git a/lapack-netlib/SRC/sgesvj.f b/lapack-netlib/SRC/sgesvj.f index 3f53a5a15..36aed2853 100644 --- a/lapack-netlib/SRC/sgesvj.f +++ b/lapack-netlib/SRC/sgesvj.f @@ -208,7 +208,7 @@ *> *> \param[in,out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On entry, *> If JOBU = 'C' : *> WORK(1) = CTOL, where CTOL defines the threshold for convergence. @@ -239,7 +239,12 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> length of WORK, WORK >= MAX(6,M+N) +*> Length of WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(6,M+N), otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; WORK(1) is set to the minial (and optimal) +*> length of WORK. *> \endverbatim *> *> \param[out] INFO @@ -260,7 +265,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -351,9 +356,9 @@ INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, - $ SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + $ SWBAND, MINMN, LWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, + $ ROTOK, RSVEC, UCTOL, UPPER * .. * .. Local Arrays .. REAL FASTR( 5 ) @@ -369,8 +374,8 @@ INTEGER ISAMAX EXTERNAL ISAMAX * from LAPACK - REAL SLAMCH - EXTERNAL SLAMCH + REAL SLAMCH, SROUNDUP_LWORK + EXTERNAL SLAMCH, SROUNDUP_LWORK LOGICAL LSAME EXTERNAL LSAME * .. @@ -394,6 +399,14 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 6, M+N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -413,7 +426,7 @@ INFO = -11 ELSE IF( UCTOL .AND. ( WORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( LWORK.LT.MAX( M+N, 6 ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 ELSE INFO = 0 @@ -423,11 +436,14 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGESVJ', -INFO ) RETURN + ELSE IF( LQUERY ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. 
( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/sgetri.f b/lapack-netlib/SRC/sgetri.f index fe71bc4a5..7b06bb63d 100644 --- a/lapack-netlib/SRC/sgetri.f +++ b/lapack-netlib/SRC/sgetri.f @@ -137,8 +137,9 @@ * .. * .. External Functions .. INTEGER ILAENV + EXTERNAL ILAENV REAL SROUNDUP_LWORK - EXTERNAL ILAENV, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMM, SGEMV, SSWAP, STRSM, STRTRI, XERBLA @@ -152,8 +153,9 @@ * INFO = 0 NB = ILAENV( 1, 'SGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -251,7 +253,7 @@ $ CALL SSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) 60 CONTINUE * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of SGETRI diff --git a/lapack-netlib/SRC/sgetsls.f b/lapack-netlib/SRC/sgetsls.f index d89c6a4e6..08a427a8b 100644 --- a/lapack-netlib/SRC/sgetsls.f +++ b/lapack-netlib/SRC/sgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -226,7 +226,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEO = 1 + WSIZEM = 1 + ELSE IF( M.GE.N ) THEN CALL SGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) diff --git a/lapack-netlib/SRC/sgetsqrhrt.f b/lapack-netlib/SRC/sgetsqrhrt.f index d80ff4da8..7ade8a66c 100644 --- a/lapack-netlib/SRC/sgetsqrhrt.f +++ b/lapack-netlib/SRC/sgetsqrhrt.f @@ -130,14 +130,17 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -216,7 +219,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -229,7 +232,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -267,8 +270,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. 
.NOT.LQUERY ) THEN INFO = -11 END IF * @@ -350,4 +354,4 @@ * * End of SGETSQRHRT * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/sgges3.f b/lapack-netlib/SRC/sgges3.f index e35d4955a..e90cd6947 100644 --- a/lapack-netlib/SRC/sgges3.f +++ b/lapack-netlib/SRC/sgges3.f @@ -234,6 +234,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK >= 6*N+16. +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -309,7 +311,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, LST2SL, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT + $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT, + $ LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SAFMAX, SAFMIN, SMLNUM * .. @@ -361,6 +364,12 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + IF( N.EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 6*N+16 + END IF +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -377,7 +386,7 @@ INFO = -15 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -17 - ELSE IF( LWORK.LT.6*N+16 .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -19 END IF * @@ -385,7 +394,7 @@ * IF( INFO.EQ.0 ) THEN CALL SGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 6*N+16, 3*N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL SORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) @@ -407,7 +416,11 @@ $ IERR ) LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) END IF - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + END IF END IF * IF( INFO.NE.0 ) THEN @@ -421,6 +434,7 @@ * IF( N.EQ.0 ) THEN SDIM = 0 + WORK( 1 ) = 1 RETURN END IF * @@ -657,7 +671,7 @@ * 40 CONTINUE * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sggev3.f b/lapack-netlib/SRC/sggev3.f index c82d2187f..d788d1147 100644 --- a/lapack-netlib/SRC/sggev3.f +++ b/lapack-netlib/SRC/sggev3.f @@ -189,6 +189,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. LWORK >= MAX(1,8*N). +*> For good performance, LWORK should generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -248,7 +250,8 @@ LOGICAL ILASCL, ILBSCL, ILV, ILVL, ILVR, LQUERY CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, - $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT + $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT, + $ LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP * .. @@ -298,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 8*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -312,7 +316,7 @@ INFO = -12 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -14 - ELSE IF( LWORK.LT.MAX( 1, 8*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -16 END IF * @@ -320,28 +324,31 @@ * IF( INFO.EQ.0 ) THEN CALL SGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, 8*N, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL SORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL SGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL, LDVL, $ VR, LDVR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) IF( ILVL ) THEN CALL SORGQR( N, N, N, VL, LDVL, WORK, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL SLAQZ0( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) ELSE CALL SLAQZ0( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF - WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) -* END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/sgghd3.f b/lapack-netlib/SRC/sgghd3.f index 9c5858b5a..01e57088a 100644 --- a/lapack-netlib/SRC/sgghd3.f +++ b/lapack-netlib/SRC/sgghd3.f @@ -179,14 +179,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> -*> \param[in] LWORK +*> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -276,7 +276,12 @@ * INFO = 0 NB = ILAENV( 1, 'SGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) @@ -326,7 +331,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = ONE RETURN @@ -886,6 +890,7 @@ IF ( JCOL.LT.IHI ) $ CALL SGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) +* WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/sggqrf.f b/lapack-netlib/SRC/sggqrf.f index ebb42a899..d32b48410 100644 --- a/lapack-netlib/SRC/sggqrf.f +++ b/lapack-netlib/SRC/sggqrf.f @@ -236,8 +236,9 @@ * .. * .. External Functions .. INTEGER ILAENV + EXTERNAL ILAENV REAL SROUNDUP_LWORK - EXTERNAL ILAENV, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. Intrinsic Functions .. 
INTRINSIC INT, MAX, MIN @@ -251,8 +252,9 @@ NB2 = ILAENV( 1, 'SGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'SORMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -289,6 +291,7 @@ * CALL SGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO ) LWKOPT = MAX( LOPT, INT( WORK( 1 ) ) ) +* WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/sggrqf.f b/lapack-netlib/SRC/sggrqf.f index 2163f1ef8..b3842ec2a 100644 --- a/lapack-netlib/SRC/sggrqf.f +++ b/lapack-netlib/SRC/sggrqf.f @@ -250,7 +250,7 @@ NB2 = ILAENV( 1, 'SGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'SORMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P)*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN diff --git a/lapack-netlib/SRC/sggsvd3.f b/lapack-netlib/SRC/sggsvd3.f index 053fff5de..cee630593 100644 --- a/lapack-netlib/SRC/sggsvd3.f +++ b/lapack-netlib/SRC/sggsvd3.f @@ -278,7 +278,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/sggsvp3.f b/lapack-netlib/SRC/sggsvp3.f index a463b9064..8e90d770c 100644 --- a/lapack-netlib/SRC/sggsvp3.f +++ b/lapack-netlib/SRC/sggsvp3.f @@ -227,7 +227,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -300,8 +300,9 @@ * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEQP3, SGEQR2, SGERQ2, SLACPY, SLAPMT, diff --git a/lapack-netlib/SRC/slamswlq.f b/lapack-netlib/SRC/slamswlq.f index d4996b1f2..432afaded 100644 --- a/lapack-netlib/SRC/slamswlq.f +++ b/lapack-netlib/SRC/slamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -189,33 +192,38 @@ *> SIAM J. Sci. Comput, vol. 34, no. 
1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE SLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + REAL A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL STPMLQT, SGEMLQT, XERBLA * .. @@ -223,52 +231,60 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * INFO = 0 IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLAMSWLQ', -INFO ) - WORK(1) = LW RETURN - ELSE IF (LQUERY) THEN - WORK(1) = LW + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -402,7 +418,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLAMSWLQ diff --git a/lapack-netlib/SRC/slamtsqr.f b/lapack-netlib/SRC/slamtsqr.f index 960b794de..f9b167aea 100644 --- a/lapack-netlib/SRC/slamtsqr.f +++ b/lapack-netlib/SRC/slamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) -*> +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. 
*> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -191,33 +193,38 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE SLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + REAL A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL SGEMQRT, STPMQRT, XERBLA * .. @@ -225,12 +232,13 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -238,11 +246,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -253,38 +267,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN. AND. 
(.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = LW + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL SGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -410,7 +424,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLAMTSQR diff --git a/lapack-netlib/SRC/slaqp2rk.c b/lapack-netlib/SRC/slaqp2rk.c new file mode 100644 index 000000000..0bfa71ab9 --- /dev/null +++ b/lapack-netlib/SRC/slaqp2rk.c @@ -0,0 +1,918 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; 
/*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + *info = *n + kk - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL >= ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL >= ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + +/* Set K, the number of factorized columns. */ + + *k = kk - 1; + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. */ + + i__2 = minmnfact; + for (j = kk; j <= i__2; ++j) { + tau[j] = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,KK:N): */ +/* 1) swap the KK-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) copy the KK-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than KK in the next loop step.) */ +/* 3) Save the pivot interchange with the indices relative to the */ +/* the original matrix A, not the block A(1:M,1:N). */ + + if (kp != kk) { + sswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1); + vn1[kp] = vn1[kk]; + vn2[kp] = vn2[kk]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[kk]; + jpiv[kk] = itemp; + } + +/* Generate elementary reflector H(KK) using the column A(I:M,KK), */ +/* if the column has more than one element, otherwise */ +/* the elementary reflector would be an identity matrix, */ +/* and TAU(KK) = ZERO. 
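*/

+/* Editor's aside (hedged, not part of the patch): the NaN and Inf
+   screening in this routine leans on IEEE-754 semantics: sisnan_
+   reduces to a self-comparison, and the Inf test compares the column
+   norm against SLAMCH( 'Overflow' ), i.e. FLT_MAX in single
+   precision. Minimal C equivalents, kept inside this comment so they
+   stay illustrative:
+
+       static int editor_is_nan( float x ) { return x != x; }
+       static int editor_is_inf( float x ) { return x > FLT_MAX; }
+
+   (editor_is_inf needs <float.h>; both names are hypothetical.) */

+/*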
*/ + + if (i__ < *m) { + i__2 = *m - i__ + 1; + slarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], & + c__1, &tau[kk]); + } else { + tau[kk] = 0.f; + } + +/* Check if TAU(KK) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(KK) for Inf, */ +/* since SLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by SLARFG can contain Inf, which requires */ +/* TAU(KK) to contain NaN. Therefore, this case of generating Inf */ +/* by SLARFG is covered by checking TAU(KK) for NaN. */ + + if (sisnan_(&tau[kk])) { + *k = kk - 1; + *info = kk; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = tau[kk]; + *relmaxc2nrmk = tau[kk]; + +/* Array TAU(KK:MINMNFACT) is not set and contains */ +/* undefined elements, except the first element TAU(KK) = NaN. */ + + return 0; + } + +/* Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left. */ +/* ( If M >= N, then at KK = N there is no residual matrix, */ +/* i.e. no columns of A to update, only columns of B. */ +/* If M < N, then at KK = M-IOFFSET, I = M and we have a */ +/* one-row residual matrix in A and the elementary */ +/* reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update */ +/* is needed for the residual matrix in A and the */ +/* right-hand-side-matrix in B. */ +/* Therefore, we update only if */ +/* KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */ +/* condition is satisfied, not only KK < N+NRHS ) */ + + if (kk < minmnupdt) { + aikk = a[i__ + kk * a_dim1]; + a[i__ + kk * a_dim1] = 1.f; + i__2 = *m - i__ + 1; + i__3 = *n + *nrhs - kk; + slarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &tau[ + kk], &a[i__ + (kk + 1) * a_dim1], lda, &work[1]); + a[i__ + kk * a_dim1] = aikk; + } + + if (kk < minmnfact) { + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */ +/* when KK < f2cmin(M-IOFFSET, N). */ + + i__2 = *n; + for (j = kk + 1; j <= i__2; ++j) { + if (vn1[j] != 0.f) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + +/* Computing 2nd power */ + r__2 = (r__1 = a[i__ + j * a_dim1], abs(r__1)) / vn1[j]; + temp = 1.f - r__2 * r__2; + temp = f2cmax(temp,0.f); +/* Computing 2nd power */ + r__1 = vn1[j] / vn2[j]; + temp2 = temp * (r__1 * r__1); + if (temp2 <= tol3z) { + +/* Compute the column 2-norm for the partial */ +/* column A(I+1:M,J) by explicitly computing it, */ +/* and store it in both partial 2-norm vector VN1 */ +/* and exact column 2-norm vector VN2. */ + + i__3 = *m - i__; + vn1[j] = snrm2_(&i__3, &a[i__ + 1 + j * a_dim1], & + c__1); + vn2[j] = vn1[j]; + + } else { + +/* Update the column 2-norm for the partial */ +/* column A(I+1:M,J) by removing one */ +/* element A(I,J) and store it in partial */ +/* 2-norm vector VN1. */ + + vn1[j] *= sqrt(temp); + + } + } + } + + } + +/* End factorization loop */ + + } + +/* If we reached this point, all colunms have been factorized, */ +/* i.e. no condition was triggered to exit the routine. */ +/* Set the number of factorized columns. */ + + *k = *kmax; + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */ +/* we return. 
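*/

+/* Editor's note, a worked restatement of the downdate above: removing
+   row I from column J gives, following LAPACK Working Note 176,
+
+       vn1(J)_new**2 = vn1(J)**2 - A(I,J)**2
+                     = vn1(J)**2 * ( 1 - ( |A(I,J)| / vn1(J) )**2 ),
+
+   i.e. vn1(J) := vn1(J) * sqrt(TEMP). The guard
+   TEMP * ( vn1(J) / vn2(J) )**2 <= TOL3Z = sqrt(eps) detects when
+   cancellation relative to vn2(J), the last exactly computed norm,
+   has made the running value untrustworthy; snrm2_ then recomputes
+   the norm of A(I+1:M,J) from scratch. */

+/*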
*/ + + if (*k < minmnfact) { + + i__1 = *n - *k; + jmaxc2nrm = *k + isamax_(&i__1, &vn1[*k + 1], &c__1); + *maxc2nrmk = vn1[jmaxc2nrm]; + + if (*k == 0) { + *relmaxc2nrmk = 1.f; + } else { + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + } + + } else { + *maxc2nrmk = 0.f; + *relmaxc2nrmk = 0.f; + } + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, set TAUs corresponding to the columns that were */ +/* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. */ + + i__1 = minmnfact; + for (j = *k + 1; j <= i__1; ++j) { + tau[j] = 0.f; + } + + return 0; + +/* End of SLAQP2RK */ + +} /* slaqp2rk_ */ + diff --git a/lapack-netlib/SRC/slaqp2rk.f b/lapack-netlib/SRC/slaqp2rk.f new file mode 100644 index 000000000..f88b0ce90 --- /dev/null +++ b/lapack-netlib/SRC/slaqp2rk.f @@ -0,0 +1,713 @@ +*> \brief \b SLAQP2RK computes truncated QR factorization with column pivoting of a real matrix block using Level 2 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SLAQP2RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, +* $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, +* $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, +* $ INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS +* REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER JPIV( * ) +* REAL A( LDA, * ), TAU( * ), VN1( * ), VN2( * ), +* $ WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLAQP2RK computes a truncated (rank K) or full rank Householder QR +*> factorization with column pivoting of a real matrix +*> block A(IOFFSET+1:M,1:N) as +*> +*> A * P(K) = Q(K) * R(K). +*> +*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N) +*> is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides matrix block B +*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**T * B. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M-IOFFSET,N), then this stopping +*> criterion is not used, factorize columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. 
+*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The second factorization stopping criterion. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The third factorization stopping criterion. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine SGEQP3RK. 1 <= KP1 <= N_orig_mat. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is DOUBLE PRECISION +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine SGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:K) below +*> the diagonal together with the array TAU represent +*> the orthogonal matrix Q(K) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:K) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS). +*> The left part A(IOFFSET+1:M,K+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(K)**T. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. 
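+*>
+*> (Editor's illustration, hedged:) after a rank-K stop, the part of
+*> the factorization computed so far has the partitioned form
+*>
+*>    A * P(K) = Q(K) * [ R11  R12 ]
+*>                      [  0   A22 ]
+*>
+*> with R11 the K-by-K upper triangular factor and A22 the residual
+*> block whose largest column 2-norm is returned in MAXC2NRMK.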
+*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is REAL array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is REAL array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is REAL array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (N-1) +*> Used in SLARF subroutine to apply an elementary +*> reflector from the left. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp2rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. 
+*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, + $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, + $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, + $ INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS + REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER JPIV( * ) + REAL A( LDA, * ), TAU( * ), VN1( * ), VN2( * ), + $ WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + INTEGER I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT, + $ MINMNUPDT + REAL AIKK, HUGEVAL, TEMP, TEMP2, TOL3Z +* .. +* .. External Subroutines .. + EXTERNAL SLARF, SLARFG, SSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL SISNAN + INTEGER ISAMAX + REAL SLAMCH, SNRM2 + EXTERNAL SISNAN, SLAMCH, ISAMAX, SNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* +* MINMNUPDT is the smallest dimension +* of the subarray A(IOFFSET+1:M,1:N+NRHS) to be udated, which +* contains the submatrices A(IOFFSET+1:M,1:N) and +* B(IOFFSET+1:M,1:NRHS) as column blocks. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + KMAX = MIN( KMAX, MINMNFACT ) + TOL3Z = SQRT( SLAMCH( 'Epsilon' ) ) + HUGEVAL = SLAMCH( 'Overflow' ) +* +* Compute the factorization, KK is the lomn loop index. +* + DO KK = 1, KMAX +* + I = IOFFSET + KK +* + IF( I.EQ.1 ) THEN +* +* ============================================================ +* +* We are at the first column of the original whole matrix A, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + + KP = KP1 +* +* ============================================================ +* + ELSE +* +* ============================================================ +* +* Determine the pivot column in KK-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( KK-1 ) + ISAMAX( N-KK+1, VN1( KK ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,KK:N) in step KK. +* RELMAXC2NRMK will be computed later, after somecondition +* checks on MAXC2NRMK. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains NaN, and set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. 
+* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( SISNAN( MAXC2NRMK ) ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + INFO = K + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* Array TAU(K+1:MINMNFACT) is not set and contains +* undefined elements. +* + RETURN + END IF +* +* ============================================================ +* +* Quick return, if the submatrix A(I:M,KK:N) is +* a zero matrix. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + RELMAXC2NRMK = ZERO +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + KK - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third stopping criteria. +* NOTE: There is no need to test for ABSTOL >= ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL >= ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. + + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* +* Set K, the number of factorized columns. +* + K = KK - 1 +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,KK:N): +* 1) swap the KK-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) copy the KK-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than KK in the next loop step.) +* 3) Save the pivot interchange with the indices relative to the +* the original matrix A, not the block A(1:M,1:N). 
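+*
+* (Editor's worked example, hedged): the main routine SGEQP3RK starts
+* JPIV at the identity permutation, e.g. JPIV = ( 1, 2, 3, 4 ); an
+* interchange of columns KK = 1 and KP = 3 then leaves
+* JPIV = ( 3, 2, 1, 4 ), so that afterwards column J of the pivoted
+* matrix is column JPIV( J ) of the original whole matrix A_orig.
+*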
+* + IF( KP.NE.KK ) THEN + CALL SSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 ) + VN1( KP ) = VN1( KK ) + VN2( KP ) = VN2( KK ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( KK ) + JPIV( KK ) = ITEMP + END IF +* +* Generate elementary reflector H(KK) using the column A(I:M,KK), +* if the column has more than one element, otherwise +* the elementary reflector would be an identity matrix, +* and TAU(KK) = ZERO. +* + IF( I.LT.M ) THEN + CALL SLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1, + $ TAU( KK ) ) + ELSE + TAU( KK ) = ZERO + END IF +* +* Check if TAU(KK) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(KK) for Inf, +* since SLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by SLARFG can contain Inf, which requires +* TAU(KK) to contain NaN. Therefore, this case of generating Inf +* by SLARFG is covered by checking TAU(KK) for NaN. +* + IF( SISNAN( TAU(KK) ) ) THEN + K = KK - 1 + INFO = KK +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAU( KK ) + RELMAXC2NRMK = TAU( KK ) +* +* Array TAU(KK:MINMNFACT) is not set and contains +* undefined elements, except the first element TAU(KK) = NaN. +* + RETURN + END IF +* +* Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left. +* ( If M >= N, then at KK = N there is no residual matrix, +* i.e. no columns of A to update, only columns of B. +* If M < N, then at KK = M-IOFFSET, I = M and we have a +* one-row residual matrix in A and the elementary +* reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update +* is needed for the residual matrix in A and the +* right-hand-side-matrix in B. +* Therefore, we update only if +* KK < MINMNUPDT = min(M-IOFFSET, N+NRHS) +* condition is satisfied, not only KK < N+NRHS ) +* + IF( KK.LT.MINMNUPDT ) THEN + AIKK = A( I, KK ) + A( I, KK ) = ONE + CALL SLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1, + $ TAU( KK ), A( I, KK+1 ), LDA, WORK( 1 ) ) + A( I, KK ) = AIKK + END IF +* + IF( KK.LT.MINMNFACT ) THEN +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. +* when KK < min(M-IOFFSET, N). +* + DO J = KK + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2 + TEMP = MAX( TEMP, ZERO ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2 .LE. TOL3Z ) THEN +* +* Compute the column 2-norm for the partial +* column A(I+1:M,J) by explicitly computing it, +* and store it in both partial 2-norm vector VN1 +* and exact column 2-norm vector VN2. +* + VN1( J ) = SNRM2( M-I, A( I+1, J ), 1 ) + VN2( J ) = VN1( J ) +* + ELSE +* +* Update the column 2-norm for the partial +* column A(I+1:M,J) by removing one +* element A(I,J) and store it in partial +* 2-norm vector VN1. +* + VN1( J ) = VN1( J )*SQRT( TEMP ) +* + END IF + END IF + END DO +* + END IF +* +* End factorization loop +* + END DO +* +* If we reached this point, all colunms have been factorized, +* i.e. no condition was triggered to exit the routine. +* Set the number of factorized columns. +* + K = KMAX +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before +* we return. 
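+*
+* (Editor's note:) in the K.LT.MINMNFACT branch below the residual
+* block A(I+1:M,K+1:N) is nonempty, so its largest column 2-norm is
+* read off the running partial norms with ISAMAX over VN1( K+1:N );
+* the K.EQ.0 convention RELMAXC2NRMK = ONE reflects that the residual
+* is then the whole of A_orig, giving the ratio MAXC2NRM / MAXC2NRM.
+*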
+* + IF( K.LT.MINMNFACT ) THEN +* + JMAXC2NRM = K + ISAMAX( N-K, VN1( K+1 ), 1 ) + MAXC2NRMK = VN1( JMAXC2NRM ) +* + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + END IF +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, set TAUs corresponding to the columns that were +* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. +* + DO J = K + 1, MINMNFACT + TAU( J ) = ZERO + END DO +* + RETURN +* +* End of SLAQP2RK +* + END diff --git a/lapack-netlib/SRC/slaqp3rk.c b/lapack-netlib/SRC/slaqp3rk.c new file mode 100644 index 000000000..e3632538b --- /dev/null +++ b/lapack-netlib/SRC/slaqp3rk.c @@ -0,0 +1,1109 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + 
ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + sgemm_("No transpose", "Transpose", &i__1, nrhs, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* Quick return, if the submatrix A(I:M,K:N) is */ +/* a zero matrix. We need to check it only if the column index */ +/* (same as row index) is larger than 1, since the condition */ +/* for the whole original matrix A_orig is checked in the main */ +/* routine. */ + + if (*maxc2nrmk == 0.f) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *relmaxc2nrmk = 0.f; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix is zero and we stop the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + sgemm_("No transpose", "Transpose", &i__1, nrhs, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + tau[j] = 0.f; + } + +/* Return from the routine. 
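*/

+/* Editor's note (hedged): each early exit in this routine leaves the
+   trailing matrix consistent by flushing the accumulated block
+   reflector as one rank-KB update,
+
+       A(IF+1:M, KB+1:N+NRHS) := A(IF+1:M, KB+1:N+NRHS)
+                               - A(IF+1:M, 1:KB) * F(KB+1:N+NRHS, 1:KB)**T
+
+   (or only the right-hand-side columns N+1:N+NRHS when the residual
+   of A itself is abandoned). That is what the sgemm_ calls above
+   compute, with alpha = -1 and beta = 1 in the underlying Fortran
+   SLAQP3RK. */

+/*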
*/ + + return 0; + + } + +/* ============================================================ */ + +/* Check if the submatrix A(I:M,K:N) contains Inf, */ +/* set INFO parameter to the column number, where */ +/* the first Inf is found plus N, and continue */ +/* the computation. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + if (*info == 0 && *maxc2nrmk > myhugeval) { + *info = *n + k - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third tolerance stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL.GE.ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig; */ + + *kb = k - 1; + if__ = i__ - 1; + +/* Apply the block reflector to the residual of the */ +/* matrix A and the residual of the right hand sides B, if */ +/* the residual matrix and and/or the residual of the right */ +/* hand sides exist, i.e. if the submatrix */ +/* A(I+1:M,KB+1:N+NRHS) exists. This occurs when */ +/* KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + sgemm_("No transpose", "Transpose", &i__1, &i__2, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*kb + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + tau[j] = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,K:N): */ +/* 1) swap the K-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */ +/* 3) copy the K-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. (Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than K in the next loop step.) */ +/* 4) Save the pivot interchange with the indices relative to the */ +/* the original matrix A_orig, not the block A(1:M,1:N). 
*/ + + if (kp != k) { + sswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + sswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf); + vn1[kp] = vn1[k]; + vn2[kp] = vn2[k]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[k]; + jpiv[k] = itemp; + } + +/* Apply previous Householder reflectors to column K: */ +/* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + sgemv_("No transpose", &i__1, &i__2, &c_b7, &a[i__ + a_dim1], lda, + &f[k + f_dim1], ldf, &c_b8, &a[i__ + k * a_dim1], &c__1); + } + +/* Generate elementary reflector H(k) using the column A(I:M,K). */ + + if (i__ < *m) { + i__1 = *m - i__ + 1; + slarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], & + c__1, &tau[k]); + } else { + tau[k] = 0.f; + } + +/* Check if TAU(K) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(K) for Inf, */ +/* since SLARFG cannot produce TAU(K) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by SLARFG can contain Inf, which requires */ +/* TAU(K) to contain NaN. Therefore, this case of generating Inf */ +/* by SLARFG is covered by checking TAU(K) for NaN. */ + + if (sisnan_(&tau[k])) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *info = k; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = tau[k]; + *relmaxc2nrmk = tau[k]; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix contains NaN and we stop */ +/* the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + sgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &c_b7, & + a[if__ + 1 + a_dim1], lda, &f[*n + 1 + f_dim1], ldf, & + c_b8, &a[if__ + 1 + (*n + 1) * a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* =============================================================== */ + + aik = a[i__ + k * a_dim1]; + a[i__ + k * a_dim1] = 1.f; + +/* =============================================================== */ + +/* Compute the current K-th column of F: */ +/* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K). */ + + if (k < *n + *nrhs) { + i__1 = *m - i__ + 1; + i__2 = *n + *nrhs - k; + sgemv_("Transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + 1) * + a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b30, &f[k + + 1 + k * f_dim1], &c__1); + } + +/* 2) Zero out elements above and on the diagonal of the */ +/* column K in matrix F, i.e elements F(1:K,K). 
*/
+
+	i__1 = k;
+	for (j = 1; j <= i__1; ++j) {
+	    f[j + k * f_dim1] = 0.f;
+	}
+
+/*     3) Incremental updating of the K-th column of F: */
+/*        F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T */
+/*                    * A(I:M,K). */
+
+	if (k > 1) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = k - 1;
+	    r__1 = -tau[k];
+	    sgemv_("Transpose", &i__1, &i__2, &r__1, &a[i__ + a_dim1], lda, &
+		    a[i__ + k * a_dim1], &c__1, &c_b30, &auxv[1], &c__1);
+
+	    i__1 = *n + *nrhs;
+	    i__2 = k - 1;
+	    sgemv_("No transpose", &i__1, &i__2, &c_b8, &f[f_dim1 + 1], ldf, &
+		    auxv[1], &c__1, &c_b8, &f[k * f_dim1 + 1], &c__1);
+	}
+
+/* =============================================================== */
+
+/*     Update the current I-th row of A: */
+/*     A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */
+/*                      - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. */
+
+	if (k < *n + *nrhs) {
+	    i__1 = *n + *nrhs - k;
+	    sgemv_("No transpose", &i__1, &k, &c_b7, &f[k + 1 + f_dim1], ldf,
+		    &a[i__ + a_dim1], lda, &c_b8, &a[i__ + (k + 1) * a_dim1],
+		    lda);
+	}
+
+	a[i__ + k * a_dim1] = aik;
+
+/*     Update the partial column 2-norms for the residual matrix, */
+/*     only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */
+/*     when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */
+
+	if (k < minmnfact) {
+
+	    i__1 = *n;
+	    for (j = k + 1; j <= i__1; ++j) {
+		if (vn1[j] != 0.f) {
+
+/*                 NOTE: The following lines follow from the analysis in */
+/*                 Lapack Working Note 176. */
+
+		    temp = (r__1 = a[i__ + j * a_dim1], abs(r__1)) / vn1[j];
+/* Computing MAX */
+		    r__1 = 0.f, r__2 = (temp + 1.f) * (1.f - temp);
+		    temp = f2cmax(r__1,r__2);
+/* Computing 2nd power */
+		    r__1 = vn1[j] / vn2[j];
+		    temp2 = temp * (r__1 * r__1);
+		    if (temp2 <= tol3z) {
+
+/*                     At J-index, we have a difficult column for the */
+/*                     update of the 2-norm. Save the index of the previous */
+/*                     difficult column in IWORK(J-1). */
+/*                     NOTE: ILSTCC > 1, therefore we can use IWORK only */
+/*                     with N-1 elements, where the elements are */
+/*                     shifted by 1 to the left. */
+
+			iwork[j - 1] = lsticc;
+
+/*                     Set the index of the last difficult column LSTICC. */
+
+			lsticc = j;
+
+		    } else {
+			vn1[j] *= sqrt(temp);
+		    }
+		}
+	    }
+
+	}
+
+/*     End of while loop. */
+
+    }
+
+/*     Now, after the loop: */
+/*     Set KB, the number of factorized columns in the block; */
+/*     Set IF, the number of processed rows in the block, which */
+/*     is the same as the number of processed rows in */
+/*     the original whole matrix A_orig, IF = IOFFSET + KB. */
+
+    *kb = k;
+    if__ = i__;
+
+/*     Apply the block reflector to the residual of the matrix A */
+/*     and the residual of the right hand sides B, if the residual */
+/*     matrix and/or the residual of the right hand sides */
+/*     exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */
+/*     This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */
+
+/*     A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */
+/*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */
+
+    if (*kb < minmnupdt) {
+	i__1 = *m - if__;
+	i__2 = *n + *nrhs - *kb;
+	sgemm_("No transpose", "Transpose", &i__1, &i__2, kb, &c_b7, &a[if__
+		+ 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b8, &a[if__
+		+ 1 + (*kb + 1) * a_dim1], lda);
+    }
+
+/*     Recompute the 2-norm of the difficult columns. */
+/*     Loop over the index of the difficult columns from the largest */
+/*     to the smallest index. */
+
+    while(lsticc > 0) {
+
+/*        LSTICC, the index of the last difficult column, is greater */
+/*        than 1. */
+/*        ITEMP is the index of the previous difficult column.
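The downdating test in the loop above comes from LAPACK Working Note 176, cited in the comment: VN1(J) is cheap to downdate after each Householder step, but when cancellation would make the downdated value unreliable the column is flagged "difficult" and its norm is recomputed later with SNRM2. A self-contained C sketch of just that test (illustrative names, not the routine's actual interface):

    #include <math.h>

    /* Returns 1 if the downdated norm would be untrustworthy (caller must
       recompute it with snrm2), else downdates *vn1 in place and returns 0.
       a_ij is the entry eliminated this step; vn2 is the last exact norm;
       tol3z = sqrt(machine epsilon), as in the routine. */
    static int downdate_colnorm(float a_ij, float *vn1, float vn2, float tol3z) {
        float temp = fabsf(a_ij) / *vn1;
        temp = fmaxf(0.0f, (1.0f + temp) * (1.0f - temp)); /* 1 - temp^2, clamped at 0 */
        float ratio = *vn1 / vn2;
        if (temp * ratio * ratio <= tol3z)
            return 1;               /* too much cancellation: recompute later */
        *vn1 *= sqrtf(temp);        /* safe partial update */
        return 0;
    }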
*/
+
+	itemp = iwork[lsticc - 1];
+
+/*        Compute the 2-norm explicitly for the last difficult column and */
+/*        save it in the partial and exact 2-norm vectors VN1 and VN2. */
+
+/*        NOTE: The computation of VN1( LSTICC ) relies on the fact that */
+/*        SNRM2 does not fail on vectors with norm below the value of */
+/*        SQRT(SLAMCH('S')) */
+
+	i__1 = *m - if__;
+	vn1[lsticc] = snrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1);
+	vn2[lsticc] = vn1[lsticc];
+
+/*        Downdate the index of the last difficult column to */
+/*        the index of the previous difficult column. */
+
+	lsticc = itemp;
+
+    }
+
+    return 0;
+
+/*     End of SLAQP3RK */
+
+} /* slaqp3rk_ */
+
diff --git a/lapack-netlib/SRC/slaqp3rk.f b/lapack-netlib/SRC/slaqp3rk.f
new file mode 100644
index 000000000..b2dc2b334
--- /dev/null
+++ b/lapack-netlib/SRC/slaqp3rk.f
@@ -0,0 +1,935 @@
+*> \brief \b SLAQP3RK computes a step of truncated QR factorization with column pivoting of a real m-by-n matrix A using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SLAQP3RK + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+*      $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+*      $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*      $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+*       IMPLICIT NONE
+*       LOGICAL            DONE
+*       INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+*      $                   NB, NRHS
+*       REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*      $                   RELTOL
+*
+*      .. Scalar Arguments ..
+*       LOGICAL            DONE
+*       INTEGER            KB, LDA, LDF, M, N, NB, NRHS, IOFFSET
+*       REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*      $                   RELTOL
+*      ..
+*      .. Array Arguments ..
+*       INTEGER            IWORK( * ), JPIV( * )
+*       REAL               A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ),
+*      $                   VN1( * ), VN2( * )
+*      ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SLAQP3RK computes a step of truncated QR factorization with column
+*> pivoting of a real M-by-N matrix A block A(IOFFSET+1:M,1:N)
+*> by using Level 3 BLAS as
+*>
+*>                A * P(KB) = Q(KB) * R(KB).
+*>
+*> The routine tries to factorize NB columns from A starting from
+*> the row IOFFSET+1 and updates the residual matrix with BLAS 3
+*> xGEMM. The number of actually factorized columns is returned
+*> in KB and may be smaller than NB.
+*>
+*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides matrix B stored
+*> in A(IOFFSET+1:M,N+1:N+NRHS) with Q(KB)**T * B.
+*>
+*> Cases when the number of factorized columns KB < NB:
+*>
+*> (1) In some cases, due to catastrophic cancellations, it cannot
+*> factorize all NB columns and needs to update the residual matrix.
+*> Hence, the actual number of factorized columns in the block returned
+*> in KB is smaller than NB. The logical DONE is returned as FALSE.
+*> The factorization of the whole original matrix A_orig must proceed
+*> with the next block.
+*>
+*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB.
+*>
+*> (3) In case both stopping criteria ABSTOL and RELTOL are not used,
+*> and when the residual matrix is a zero matrix in some factorization
+*> step KB, the factorization of the whole original matrix A_orig is
+*> stopped, the logical DONE is returned as TRUE. The number of
+*> factorized columns which is smaller than NB is returned in KB.
+*>
+*> (4) Whenever NaN is detected in the matrix A or in the array TAU,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB. The INFO
+*> parameter is set to the column index of the first NaN occurrence.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          Factorization block size, i.e. the number of columns
+*>          to factorize in the matrix A. 0 <= NB.
+*>
+*>          If NB = 0, then the routine exits immediately.
+*>             This means that the factorization is not performed,
+*>             the matrices A and B and the arrays TAU, JPIV
+*>             are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is REAL, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          b) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine SGEQP3RK. 1 <= KP1 <= N_orig.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is REAL
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine SGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
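As a usage aside (this is not part of the routine's interface, which receives the tolerances from the SGEQP3RK driver named above): a caller determining numerical rank would typically disable the absolute criterion and tie the relative one to working precision. A hedged C sketch of one common choice; the eps*max(m,n) rule is a heuristic, not something this documentation prescribes:

    #include <float.h>

    /* Illustrative tolerance choice for truncated QR with column pivoting. */
    static void choose_tolerances(int m, int n, float *abstol, float *reltol) {
        *abstol = -1.0f;             /* ABSTOL < 0.0 switches that test off */
        *reltol = FLT_EPSILON * (float)(m > n ? m : n);  /* rank-revealing cutoff */
    }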
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:KB) below
+*>             the diagonal together with the array TAU represent
+*>             the orthogonal matrix Q(KB) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:KB) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,KB+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(KB)**T.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] DONE
+*> \verbatim
+*>          DONE is LOGICAL
+*>          TRUE: a) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to ABSTOL
+*>                   or RELTOL criterion,
+*>                b) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to the
+*>                   residual matrix being a ZERO matrix.
+*>                c) when NaN was detected in the matrix A
+*>                   or in the array TAU.
+*>          FALSE: otherwise.
+*> \endverbatim
+*>
+*> \param[out] KB
+*> \verbatim
+*>          KB is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= KB <= min(M-IOFFSET,NB,N).
+*>
+*>          KB also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is REAL
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank KB. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is REAL
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank KB) to the maximum column 2-norm of the
+*>          original matrix A_orig. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is REAL array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is REAL array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is REAL array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] AUXV
+*> \verbatim
+*>          AUXV is REAL array, dimension (NB)
+*>          Auxiliary vector.
+*> \endverbatim
+*>
+*> \param[out] F
+*> \verbatim
+*>          F is REAL array, dimension (LDF,NB)
+*>          Matrix F**T = L*(Y**T)*A.
+*> \endverbatim
+*>
+*> \param[in] LDF
+*> \verbatim
+*>          LDF is INTEGER
+*>          The leading dimension of the array F. LDF >= max(1,N+NRHS).
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          A work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step KB+1 ( when KB columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             KB                  is set to the number of
+*>                                    factorized columns without
+*>                                    exception.
+*>             MAXC2NRMK           is set to NaN.
+*>             RELMAXC2NRMK        is set to NaN.
+*>             TAU(KB+1:min(M,N))  is not set and contains undefined
+*>                                    elements. If j_1=KB+1, TAU(KB+1)
+*>                                    may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step KB+1 ( when KB columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp3rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> https://doi.org/10.1137/S1064827595296732
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> https://doi.org/10.1145/1377612.1377616
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE SLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      LOGICAL            DONE
+      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+     $                   NB, NRHS
+      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      REAL               A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ),
+     $                   VN1( * ), VN2( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            ITEMP, J, K, MINMNFACT, MINMNUPDT,
+     $                   LSTICC, KP, I, IF
+      REAL               AIK, HUGEVAL, TEMP, TEMP2, TOL3Z
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SGEMM, SGEMV, SLARFG, SSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            SISNAN
+      INTEGER            ISAMAX
+      REAL               SLAMCH, SNRM2
+      EXTERNAL           SISNAN, SLAMCH, ISAMAX, SNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT is the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      NB = MIN( NB, MINMNFACT )
+      TOL3Z = SQRT( SLAMCH( 'Epsilon' ) )
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+*     Compute factorization in a while loop over NB columns,
+*     K is the column index in the block A(1:M,1:N).
+*
+      K = 0
+      LSTICC = 0
+      DONE = .FALSE.
+*
+      DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 )
+         K = K + 1
+         I = IOFFSET + K
+*
+         IF( I.EQ.1 ) THEN
+*
+*           We are at the first column of the original whole matrix A_orig,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+            KP = KP1
+*
+         ELSE
+*
+*           Determine the pivot column in K-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,K:N).
+*
+            KP = ( K-1 ) + ISAMAX( N-K+1, VN1( K ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,K:N) in step K.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains NaN, set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( SISNAN( MAXC2NRMK ) ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*              that are non-zero in each step in the block,
+*              i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*              is the same as the number of processed rows in
+*              the original whole matrix A_orig.
+*
+               KB = K - 1
+               IF = I - 1
+               INFO = KB + KP
+*
+*              Set RELMAXC2NRMK to NaN.
+*
+               RELMAXC2NRMK = MAXC2NRMK
+*
+*              There is no need to apply the block reflector to the
+*              residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*              since the submatrix contains NaN and we stop
+*              the computation.
+*              But, we need to apply the block reflector to the residual
+*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*              residual right hand sides exist. This occurs
+*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ):
+*
+*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T.
+
+               IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+                  CALL SGEMM( 'No transpose', 'Transpose',
+     $                 M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA,
+     $                 F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Array TAU(KF+1:MINMNFACT) is not set and contains
+*              undefined elements.
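The NaN branch above, together with the Inf branch further down, implements the INFO convention documented earlier (a NaN column index in 1..N stops the factorization; an Inf is reported as N+1..2N while the computation continues). A hedged sketch of how a caller might decode it (illustrative helper, not part of LAPACK):

    /* Classify INFO from the xLAQP3RK/xGEQP3RK family per the
       convention documented above. */
    static const char *classify_info(int info, int n) {
        if (info == 0)                 return "success";
        if (info >= 1 && info <= n)    return "NaN detected; factorization stopped early";
        if (info > n && info <= 2 * n) return "Inf detected; factorization ran to completion";
        return "unexpected INFO value";
    }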
+* +* Return from the routine. +* + RETURN + END IF +* +* Quick return, if the submatrix A(I:M,K:N) is +* a zero matrix. We need to check it only if the column index +* (same as row index) is larger than 1, since the condition +* for the whole original matrix A_orig is checked in the main +* routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + RELMAXC2NRMK = ZERO +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix is zero and we stop the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL SGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. +* + DO J = K, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + K - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third tolerance stopping criteria. +* NOTE: There is no need to test for ABSTOL.GE.ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig; +* + KB = K - 1 + IF = I - 1 +* +* Apply the block reflector to the residual of the +* matrix A and the residual of the right hand sides B, if +* the residual matrix and and/or the residual of the right +* hand sides exist, i.e. if the submatrix +* A(I+1:M,KB+1:N+NRHS) exists. 
This occurs when +* KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL SGEMM( 'No transpose', 'Transpose', + $ M-IF, N+NRHS-KB, KB,-ONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. +* + DO J = K, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,K:N): +* 1) swap the K-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) +* 3) copy the K-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. (Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than K in the next loop step.) +* 4) Save the pivot interchange with the indices relative to the +* the original matrix A_orig, not the block A(1:M,1:N). +* + IF( KP.NE.K ) THEN + CALL SSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 ) + CALL SSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF ) + VN1( KP ) = VN1( K ) + VN2( KP ) = VN2( K ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( K ) + JPIV( K ) = ITEMP + END IF +* +* Apply previous Householder reflectors to column K: +* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. +* + IF( K.GT.1 ) THEN + CALL SGEMV( 'No transpose', M-I+1, K-1, -ONE, A( I, 1 ), + $ LDA, F( K, 1 ), LDF, ONE, A( I, K ), 1 ) + END IF +* +* Generate elementary reflector H(k) using the column A(I:M,K). +* + IF( I.LT.M ) THEN + CALL SLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) ) + ELSE + TAU( K ) = ZERO + END IF +* +* Check if TAU(K) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(K) for Inf, +* since SLARFG cannot produce TAU(K) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by SLARFG can contain Inf, which requires +* TAU(K) to contain NaN. Therefore, this case of generating Inf +* by SLARFG is covered by checking TAU(K) for NaN. +* + IF( SISNAN( TAU(K) ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = K +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAU( K ) + RELMAXC2NRMK = TAU( K ) +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. 
This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL SGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* =============================================================== +* + AIK = A( I, K ) + A( I, K ) = ONE +* +* =============================================================== +* +* Compute the current K-th column of F: +* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K). +* + IF( K.LT.N+NRHS ) THEN + CALL SGEMV( 'Transpose', M-I+1, N+NRHS-K, + $ TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1, + $ ZERO, F( K+1, K ), 1 ) + END IF +* +* 2) Zero out elements above and on the diagonal of the +* column K in matrix F, i.e elements F(1:K,K). +* + DO J = 1, K + F( J, K ) = ZERO + END DO +* +* 3) Incremental updating of the K-th column of F: +* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T +* * A(I:M,K). +* + IF( K.GT.1 ) THEN + CALL SGEMV( 'Transpose', M-I+1, K-1, -TAU( K ), + $ A( I, 1 ), LDA, A( I, K ), 1, ZERO, + $ AUXV( 1 ), 1 ) +* + CALL SGEMV( 'No transpose', N+NRHS, K-1, ONE, + $ F( 1, 1 ), LDF, AUXV( 1 ), 1, ONE, + $ F( 1, K ), 1 ) + END IF +* +* =============================================================== +* +* Update the current I-th row of A: +* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) +* - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. +* + IF( K.LT.N+NRHS ) THEN + CALL SGEMV( 'No transpose', N+NRHS-K, K, -ONE, + $ F( K+1, 1 ), LDF, A( I, 1 ), LDA, ONE, + $ A( I, K+1 ), LDA ) + END IF +* + A( I, K ) = AIK +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. +* when K < MINMNFACT = min( M-IOFFSET, N ). +* + IF( K.LT.MINMNFACT ) THEN +* + DO J = K + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ABS( A( I, J ) ) / VN1( J ) + TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2.LE.TOL3Z ) THEN +* +* At J-index, we have a difficult column for the +* update of the 2-norm. Save the index of the previous +* difficult column in IWORK(J-1). +* NOTE: ILSTCC > 1, threfore we can use IWORK only +* with N-1 elements, where the elements are +* shifted by 1 to the left. +* + IWORK( J-1 ) = LSTICC +* +* Set the index of the last difficult column LSTICC. +* + LSTICC = J +* + ELSE + VN1( J ) = VN1( J )*SQRT( TEMP ) + END IF + END IF + END DO +* + END IF +* +* End of while loop. +* + END DO +* +* Now, afler the loop: +* Set KB, the number of factorized columns in the block; +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig, IF = IOFFSET + KB. +* + KB = K + IF = I +* +* Apply the block reflector to the residual of the matrix A +* and the residual of the right hand sides B, if the residual +* matrix and and/or the residual of the right hand sides +* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. 
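(The interrupted comment resumes below.) The deferred update it describes is a single rank-KB GEMM applied to the residual columns and the appended right-hand sides at once. A hedged CBLAS sketch of the same operation in C, assuming OpenBLAS's cblas.h and 0-based column-major storage (names illustrative):

    #include <cblas.h>

    /* A(IF+1:M, KB+1:N+NRHS) -= A(IF+1:M, 1:KB) * F(KB+1:N+NRHS, 1:KB)**T,
       with IF = ioff + kb, as in the trailing update of this routine. */
    static void apply_block_reflector(int m, int n, int nrhs, int kb, int ioff,
                                      float *a, int lda, float *f, int ldf) {
        int rows = m - (ioff + kb);      /* rows of the residual block */
        int cols = n + nrhs - kb;        /* residual + RHS columns     */
        if (rows <= 0 || cols <= 0 || kb <= 0) return;
        cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans,
                    rows, cols, kb, -1.0f,
                    &a[(ioff + kb)], lda,        /* A(IF+1:M, 1:KB)      */
                    &f[kb], ldf,                 /* F(KB+1:N+NRHS, 1:KB) */
                    1.0f, &a[(ioff + kb) + (size_t)kb * lda], lda);
    }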
+* This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL SGEMM( 'No transpose', 'Transpose', + $ M-IF, N+NRHS-KB, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* Recompute the 2-norm of the difficult columns. +* Loop over the index of the difficult columns from the largest +* to the smallest index. +* + DO WHILE( LSTICC.GT.0 ) +* +* LSTICC is the index of the last difficult column is greater +* than 1. +* ITEMP is the index of the previous difficult column. +* + ITEMP = IWORK( LSTICC-1 ) +* +* Compute the 2-norm explicilty for the last difficult column and +* save it in the partial and exact 2-norm vectors VN1 and VN2. +* +* NOTE: The computation of VN1( LSTICC ) relies on the fact that +* SNRM2 does not fail on vectors with norm below the value of +* SQRT(SLAMCH('S')) +* + VN1( LSTICC ) = SNRM2( M-IF, A( IF+1, LSTICC ), 1 ) + VN2( LSTICC ) = VN1( LSTICC ) +* +* Downdate the index of the last difficult column to +* the index of the previous difficult column. +* + LSTICC = ITEMP +* + END DO +* + RETURN +* +* End of SLAQP3RK +* + END diff --git a/lapack-netlib/SRC/slaswlq.f b/lapack-netlib/SRC/slaswlq.f index 685f823a0..594c646db 100644 --- a/lapack-netlib/SRC/slaswlq.f +++ b/lapack-netlib/SRC/slaswlq.f @@ -96,22 +96,24 @@ *> The leading dimension of the array T. LDT >= MB. *> \endverbatim *> -*> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) -*> +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= MB * M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim + *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -163,32 +165,35 @@ *> * ===================================================================== SUBROUTINE SLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), T( LDT, *) + REAL A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL SGELQT, SGEQRT, STPLQT, STPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. 
@@ -199,12 +204,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LE.0 ) THEN INFO = -4 @@ -212,60 +224,60 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = MB*M + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN - CALL SGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) + IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN + CALL SGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO ) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL SGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL SGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M, (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL STPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), - $ LDA, T(1, CTR * M + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL STPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + $ LDA, T(1, CTR * M + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL STPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), - $ LDA, T(1, CTR * M + 1), LDT, - $ WORK, INFO ) - END IF + $ LDA, T(1, CTR * M + 1), LDT, + $ WORK, INFO ) + END IF * - WORK( 1 ) = SROUNDUP_LWORK(M * MB) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLASWLQ diff --git a/lapack-netlib/SRC/slatrs3.f b/lapack-netlib/SRC/slatrs3.f index 8f0c4bf16..17052289e 100644 --- a/lapack-netlib/SRC/slatrs3.f +++ b/lapack-netlib/SRC/slatrs3.f @@ -151,13 +151,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK). -*> On exit, if INFO = 0, WORK(1) returns the optimal size of -*> WORK. +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -165,6 +168,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -181,7 +185,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -253,7 +257,7 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. @@ -264,7 +268,8 @@ EXTERNAL ILAENV, LSAME, SLAMCH, SLANGE, SLARMM * .. * .. External Subroutines .. - EXTERNAL SLATRS, SSCAL, XERBLA + REAL SROUNDUP_LWORK + EXTERNAL SLATRS, SSCAL, SROUNDUP_LWORK, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN @@ -292,15 +297,24 @@ * row. WORK( I + KK * LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * * Test the input parameters. * @@ -322,7 +336,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN @@ -650,6 +664,8 @@ END DO END DO RETURN +* + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * * End of SLATRS3 * diff --git a/lapack-netlib/SRC/slatsqr.f b/lapack-netlib/SRC/slatsqr.f index 86733bb15..4730815b5 100644 --- a/lapack-netlib/SRC/slatsqr.f +++ b/lapack-netlib/SRC/slatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -161,33 +164,39 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE SLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), T(LDT, *) + REAL A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. 
- LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL SGEQRT, STPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -198,6 +207,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -205,64 +221,65 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = NB*N +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL SGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL SGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL SGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CALL SGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) * - CTR = 1 - DO I = MB+1, II-MB+N , (MB-N) + CTR = 1 + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL STPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, - $ T(1, CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL STPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + $ T(1, CTR * N + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL STPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, - $ T(1, CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + IF( II.LE.M ) THEN + CALL STPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + $ T(1, CTR * N + 1), LDT, + $ WORK, INFO ) + END IF * - work( 1 ) = N*NB + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLATSQR diff --git a/lapack-netlib/SRC/ssyevd.f b/lapack-netlib/SRC/ssyevd.f index a5e4638d6..2ae44fc81 100644 --- a/lapack-netlib/SRC/ssyevd.f +++ b/lapack-netlib/SRC/ssyevd.f @@ -96,8 +96,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, -*> dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -251,7 +250,7 @@ $ N*ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) ) LIOPT = LIWMIN END IF - WORK( 1 ) = SROUNDUP_LWORK(LOPT) + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) IWORK( 1 ) = LIOPT * IF( LWORK.LT.LWMIN .AND. 
.NOT.LQUERY ) THEN @@ -335,7 +334,7 @@ IF( ISCALE.EQ.1 ) $ CALL SSCAL( N, ONE / SIGMA, W, 1 ) * - WORK( 1 ) = SROUNDUP_LWORK(LOPT) + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) IWORK( 1 ) = LIOPT * RETURN diff --git a/lapack-netlib/SRC/ssyevr.f b/lapack-netlib/SRC/ssyevr.f index 47e4d7cbf..870facd60 100644 --- a/lapack-netlib/SRC/ssyevr.f +++ b/lapack-netlib/SRC/ssyevr.f @@ -271,7 +271,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,26*N). +*> The dimension of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 26*N. *> For optimal efficiency, LWORK >= (NB+6)*N, *> where NB is the max of the blocksize for SSYTRD and SORMTR *> returned by ILAENV. @@ -292,7 +293,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK and @@ -392,8 +394,13 @@ * LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) ) * - LWMIN = MAX( 1, 26*N ) - LIWMIN = MAX( 1, 10*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 26*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -428,7 +435,7 @@ NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -677,7 +684,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/ssyevr_2stage.f b/lapack-netlib/SRC/ssyevr_2stage.f index a2d6a6231..471e25977 100644 --- a/lapack-netlib/SRC/ssyevr_2stage.f +++ b/lapack-netlib/SRC/ssyevr_2stage.f @@ -278,6 +278,7 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 5*N @@ -300,13 +301,14 @@ *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) -*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. *> \endverbatim *> *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. 
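All of the LWORK/LIWORK changes in these hunks preserve LAPACK's two-call workspace-query protocol: pass LWORK = LIWORK = -1, read the required sizes back from WORK(1)/IWORK(1), allocate, and call again. A hedged caller-side sketch in C against SSYEVD (whose hunk appears above); the hand-written prototype, the trailing-underscore name mangling, and the omission of hidden character-length arguments are platform assumptions:

    #include <stdlib.h>

    extern void ssyevd_(const char *jobz, const char *uplo, const int *n,
                        float *a, const int *lda, float *w,
                        float *work, const int *lwork,
                        int *iwork, const int *liwork, int *info);

    static int eig_with_query(int n, float *a, int lda, float *w) {
        int info, lwork = -1, liwork = -1, iwq;
        float wq;
        /* Query call: only the workspace sizes are computed. */
        ssyevd_("V", "U", &n, a, &lda, w, &wq, &lwork, &iwq, &liwork, &info);
        if (info != 0) return info;
        lwork = (int)wq;
        liwork = iwq;
        float *work = malloc((size_t)lwork * sizeof *work);
        int *iwork = malloc((size_t)liwork * sizeof *iwork);
        if (!work || !iwork) { free(work); free(iwork); return -1; }
        /* Real call with properly sized workspaces. */
        ssyevd_("V", "U", &n, a, &lda, w, work, &lwork, iwork, &liwork, &info);
        free(work); free(iwork);
        return info;
    }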
*> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the IWORK array, @@ -445,8 +447,14 @@ IB = ILAENV2STAGE( 2, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -485,7 +493,7 @@ * NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) * NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) * LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) IWORK( 1 ) = LIWMIN END IF * @@ -505,7 +513,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 26 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = A( 1, 1 ) @@ -733,7 +741,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/ssyevx.f b/lapack-netlib/SRC/ssyevx.f index 2204aa39b..aaed6dad5 100644 --- a/lapack-netlib/SRC/ssyevx.f +++ b/lapack-netlib/SRC/ssyevx.f @@ -338,14 +338,14 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWKMIN = 1 - WORK( 1 ) = SROUNDUP_LWORK(LWKMIN) + LWKOPT = 1 ELSE LWKMIN = 8*N NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( LWKMIN, ( NB + 3 )*N ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -17 @@ -542,7 +542,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssysv_aa.f b/lapack-netlib/SRC/ssysv_aa.f index e43d4de7f..711a275e1 100644 --- a/lapack-netlib/SRC/ssysv_aa.f +++ b/lapack-netlib/SRC/ssysv_aa.f @@ -177,12 +177,13 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS + INTEGER LWKMIN, LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, SSYTRS_AA, SSYTRF_AA @@ -196,6 +197,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -206,18 +208,18 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_SYTRF = INT( WORK(1) ) + LWKOPT_SYTRF = INT( WORK( 1 ) ) CALL SSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_SYTRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT_SYTRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_SYTRF, LWKOPT_SYTRS ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -239,7 +241,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.f b/lapack-netlib/SRC/ssysv_aa_2stage.f index 3d88e068e..fb068b3bf 100644 --- a/lapack-netlib/SRC/ssysv_aa_2stage.f +++ b/lapack-netlib/SRC/ssysv_aa_2stage.f @@ -100,14 +100,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is REAL array, dimension (LTB) +*> TB is REAL array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -147,14 +147,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL workspace of size LWORK +*> WORK is REAL workspace of size (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -204,12 +205,13 @@ * .. * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKMIN, LWKOPT * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSYTRF_AA_2STAGE, SSYTRS_AA_2STAGE, @@ -226,6 +228,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -234,18 +237,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -255,7 +259,6 @@ RETURN END IF * -* * Compute the factorization A = U**T*T*U or A = L*T*L**T. * CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, @@ -269,7 +272,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssysvx.f b/lapack-netlib/SRC/ssysvx.f index 0d72217eb..06a6413f1 100644 --- a/lapack-netlib/SRC/ssysvx.f +++ b/lapack-netlib/SRC/ssysvx.f @@ -305,7 +305,7 @@ * .. 
* .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKMIN, LWKOPT, NB REAL ANORM * .. * .. External Functions .. @@ -327,6 +327,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 3*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -344,12 +345,12 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 3*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 3*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) diff --git a/lapack-netlib/SRC/ssytrd_2stage.f b/lapack-netlib/SRC/ssytrd_2stage.f index 5d70ae0d4..5b401c3d0 100644 --- a/lapack-netlib/SRC/ssytrd_2stage.f +++ b/lapack-netlib/SRC/ssytrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download SSYTRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download SSYTRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * REAL A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. -* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. *> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the orthogonal *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is REAL array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is REAL array, dimension (LHOUS2) +*> HOUS2 is REAL array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. 
+*> *> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -149,17 +151,19 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +181,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +206,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +214,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. 
-*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -265,10 +269,13 @@ * KD = ILAENV2STAGE( 1, 'SSYTRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'SSYTRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'SSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -309,14 +316,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL SSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL SSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRD_SY2SB', -INFO ) RETURN END IF - CALL SSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL SSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRD_SB2ST', -INFO ) @@ -324,8 +331,7 @@ END IF * * - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN RETURN * * End of SSYTRD_2STAGE diff --git a/lapack-netlib/SRC/ssytrd_sb2st.F b/lapack-netlib/SRC/ssytrd_sb2st.F index 32bae26dc..111eaa93e 100644 --- a/lapack-netlib/SRC/ssytrd_sb2st.F +++ b/lapack-netlib/SRC/ssytrd_sb2st.F @@ -132,15 +132,17 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is REAL array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is REAL array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension) +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error @@ -152,14 +154,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension LWORK. +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. 
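Several hunks in this patch route every size stored into the REAL array WORK through SROUNDUP_LWORK. The motivation: integers above 2**24 are not exactly representable in IEEE single precision, so a plain REAL(LWKOPT) can round down, and a caller's INT(WORK(1)) would then under-allocate. The demo below re-implements the gist of that helper under a hypothetical name, MY_ROUNDUP (a sketch of the idea, not the verbatim LAPACK source):

    program roundup_demo
       implicit none
       integer :: k
       k = 2**24 + 1                           ! not representable in single
       print *, int( real( k ) ) < k           ! T: plain conversion loses 1
       print *, int( my_roundup( k ) ) >= k    ! T: rounded-up value is safe
    contains
       real function my_roundup( lwork )
          integer, intent(in) :: lwork
          my_roundup = real( lwork )
          ! If REAL() rounded down, bump to the next representable value.
          if ( int( my_roundup ) < lwork ) &
             my_roundup = my_roundup * ( 1.0 + epsilon( 0.0 ) )
       end function my_roundup
    end program roundup_demo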
+*> IF N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension) +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -261,7 +266,7 @@ INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, + $ NBTILES, TTYPE, TID, NTHREADS, $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SISEV, SIZETAU, LDV, LHMIN, LWMIN @@ -283,7 +288,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -292,9 +296,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -315,8 +324,8 @@ END IF * IF( INFO.EQ.0 ) THEN - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + HOUS( 1 ) = SROUNDUP_LWORK( LHMIN ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -544,8 +553,7 @@ 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SSYTRD_SB2ST diff --git a/lapack-netlib/SRC/ssytrd_sy2sb.f b/lapack-netlib/SRC/ssytrd_sy2sb.f index 4efc43630..3996e07bb 100644 --- a/lapack-netlib/SRC/ssytrd_sy2sb.f +++ b/lapack-netlib/SRC/ssytrd_sy2sb.f @@ -124,7 +124,7 @@ *> \param[out] WORK *> \verbatim *> WORK is REAL array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY) +*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -294,8 +296,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. 
.NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -314,7 +320,7 @@ CALL XERBLA( 'SSYTRD_SY2SB', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN END IF * @@ -507,7 +513,7 @@ END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SSYTRD_SY2SB diff --git a/lapack-netlib/SRC/ssytrf.f b/lapack-netlib/SRC/ssytrf.f index a788fbcf0..55f3a4f0f 100644 --- a/lapack-netlib/SRC/ssytrf.f +++ b/lapack-netlib/SRC/ssytrf.f @@ -234,7 +234,7 @@ * NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -353,7 +353,8 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF diff --git a/lapack-netlib/SRC/ssytrf_aa.f b/lapack-netlib/SRC/ssytrf_aa.f index d6408a978..af32fb064 100644 --- a/lapack-netlib/SRC/ssytrf_aa.f +++ b/lapack-netlib/SRC/ssytrf_aa.f @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= MAX(1,2*N). For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -128,7 +130,7 @@ *> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -142,19 +144,19 @@ * .. * .. Array Arguments .. INTEGER IPIV( * ) - REAL A( LDA, * ), WORK( * ) + REAL A( LDA, * ), WORK( * ) * .. * * ===================================================================== * .. Parameters .. - REAL ZERO, ONE + REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB - REAL ALPHA + REAL ALPHA * .. * .. External Functions .. LOGICAL LSAME @@ -180,19 +182,26 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -204,11 +213,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN RETURN END IF * @@ -458,7 +467,8 @@ END IF * 20 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF_AA diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.f b/lapack-netlib/SRC/ssytrf_aa_2stage.f index abe6564c5..6b5cdee1b 100644 --- a/lapack-netlib/SRC/ssytrf_aa_2stage.f +++ b/lapack-netlib/SRC/ssytrf_aa_2stage.f @@ -94,7 +94,7 @@ *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL workspace of size LWORK +*> WORK is REAL workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -212,9 +212,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -228,10 +228,10 @@ NB = ILAENV( 1, 'SSYTRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = SROUNDUP_LWORK( MAX( 1, (3*NB+1)*N ) ) END IF IF( WQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(N*NB) + WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N*NB ) ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -240,7 +240,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * diff --git a/lapack-netlib/SRC/ssytrf_rk.f b/lapack-netlib/SRC/ssytrf_rk.f index 72830543c..89ecf38fd 100644 --- a/lapack-netlib/SRC/ssytrf_rk.f +++ b/lapack-netlib/SRC/ssytrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension ( MAX(1,LWORK) ). +*> WORK is REAL array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. 
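For the ?SYTRF family, the documented optimum LWORK >= N*NB is what enables the blocked factorization path; the minimum merely guarantees correctness. A short example of querying that size and then factoring and solving with SSYTRF/SSYTRS (illustrative only, not part of the patch; assumes a linked LAPACK):

    program ssytrf_solve
       implicit none
       integer, parameter :: n = 3, nrhs = 1
       real    :: a(n,n), b(n,nrhs), wq(1)
       integer :: ipiv(n), info, lwork
       real, allocatable :: work(:)
       external ssytrf, ssytrs

       ! Symmetric indefinite system; only the upper triangle is referenced.
       a = reshape( [ 4.0, 1.0, 2.0,   &
                      1.0, 5.0, 3.0,   &
                      2.0, 3.0, 6.0 ], [ n, n ] )
       b(:,1) = [ 1.0, 2.0, 3.0 ]
       ! Query the optimal LWORK (N*NB for the blocked code path).
       call ssytrf( 'U', n, a, n, ipiv, wq, -1, info )
       lwork = int( wq(1) )
       allocate( work(lwork) )
       call ssytrf( 'U', n, a, n, ipiv, work, lwork, info )
       if ( info == 0 ) call ssytrs( 'U', n, nrhs, a, n, ipiv, b, n, info )
       print *, 'info =', info, '  x =', b(:,1)
    end program ssytrf_solve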
*> @@ -312,7 +312,7 @@ * NB = ILAENV( 1, 'SSYTRF_RK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -488,7 +488,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF_RK diff --git a/lapack-netlib/SRC/ssytrf_rook.f b/lapack-netlib/SRC/ssytrf_rook.f index 339a229e7..7c2cbbc57 100644 --- a/lapack-netlib/SRC/ssytrf_rook.f +++ b/lapack-netlib/SRC/ssytrf_rook.f @@ -118,7 +118,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -260,7 +260,7 @@ * NB = ILAENV( 1, 'SSYTRF_ROOK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -383,7 +383,8 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF_ROOK diff --git a/lapack-netlib/SRC/ssytri2.f b/lapack-netlib/SRC/ssytri2.f index 49f6cad65..fd1c53473 100644 --- a/lapack-netlib/SRC/ssytri2.f +++ b/lapack-netlib/SRC/ssytri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (N+NB+1)*(NB+3) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE SSYTRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -147,7 +147,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSYTRI, SSYTRI2X, XERBLA @@ -159,9 +160,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +178,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = SROUNDUP_LWORK( MINSIZE ) RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - - IF( NBMAX .GE. 
N ) THEN +* + IF( NBMAX.GE.N ) THEN CALL SSYTRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL SSYTRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of SSYTRI2 diff --git a/lapack-netlib/SRC/ssytri_3.f b/lapack-netlib/SRC/ssytri_3.f index bca01105d..f0152a149 100644 --- a/lapack-netlib/SRC/ssytri_3.f +++ b/lapack-netlib/SRC/ssytri_3.f @@ -119,16 +119,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (N+NB+1)*(NB+3). +*> WORK is REAL array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= (N+NB+1)*(NB+3). +*> The length of WORK. +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> -*> If LDWORK = -1, then a workspace query is assumed; +*> If LWORK = -1, then a workspace query is assumed; *> the routine only calculates the optimal size of the optimal *> size of the WORK array, returns this value as the first *> entry of the WORK array, and no error message related to @@ -209,8 +210,13 @@ * * Determine the block size * - NB = MAX( 1, ILAENV( 1, 'SSYTRI_3', UPLO, N, -1, -1, -1 ) ) - LWKOPT = ( N+NB+1 ) * ( NB+3 ) + IF( N.EQ.0 ) THEN + LWKOPT = 1 + ELSE + NB = MAX( 1, ILAENV( 1, 'SSYTRI_3', UPLO, N, -1, -1, -1 ) ) + LWKOPT = ( N+NB+1 ) * ( NB+3 ) + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 @@ -218,7 +224,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN INFO = -8 END IF * @@ -226,7 +232,6 @@ CALL XERBLA( 'SSYTRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -237,7 +242,7 @@ * CALL SSYTRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO ) * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssytrs_aa.f b/lapack-netlib/SRC/ssytrs_aa.f index 12fca0c71..265cf0c1d 100644 --- a/lapack-netlib/SRC/ssytrs_aa.f +++ b/lapack-netlib/SRC/ssytrs_aa.f @@ -105,7 +105,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -141,7 +147,7 @@ * .. * .. Array Arguments .. INTEGER IPIV( * ) - REAL A( LDA, * ), B( LDB, * ), WORK( * ) + REAL A( LDA, * ), B( LDB, * ), WORK( * ) * .. * * ===================================================================== @@ -151,24 +157,31 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGTSV, SSWAP, SLACPY, STRSM, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. 
.NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -179,21 +192,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKMIN ) RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/zbbcsd.f b/lapack-netlib/SRC/zbbcsd.f index 757e98c71..6601f4a06 100644 --- a/lapack-netlib/SRC/zbbcsd.f +++ b/lapack-netlib/SRC/zbbcsd.f @@ -804,7 +804,7 @@ CALL DLARTGP( B22BULGE, B22E(I-1), RWORK(IU2SN+I-1), $ RWORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL DLARTGS( B21E(I), B21E(I+1), NU, RWORK(IU2CS+I-1), + CALL DLARTGS( B21E(I), B21D(I+1), NU, RWORK(IU2CS+I-1), $ RWORK(IU2SN+I-1) ) ELSE CALL DLARTGS( B22D(I), B22E(I), MU, RWORK(IU2CS+I-1), diff --git a/lapack-netlib/SRC/zgebrd.f b/lapack-netlib/SRC/zgebrd.f index f1791c6a4..c1a6169a7 100644 --- a/lapack-netlib/SRC/zgebrd.f +++ b/lapack-netlib/SRC/zgebrd.f @@ -122,7 +122,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -147,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -223,8 +224,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. EXTERNAL XERBLA, ZGEBD2, ZGEMM, ZLABRD @@ -241,9 +242,17 @@ * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + END IF WORK( 1 ) = DBLE( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -251,7 +260,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -263,7 +272,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -282,7 +290,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using diff --git a/lapack-netlib/SRC/zgedmd.f90 b/lapack-netlib/SRC/zgedmd.f90 index 090641ad8..5045cb166 100644 --- a/lapack-netlib/SRC/zgedmd.f90 +++ b/lapack-netlib/SRC/zgedmd.f90 @@ -1,389 +1,539 @@ - SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & - M, N, X, LDX, Y, LDY, NRNK, TOL, & - K, EIGS, Z, LDZ, RES, B, LDB, & - W, LDW, S, LDS, ZWORK, LZWORK, & - RWORK, LRWORK, IWORK, LIWORK, INFO ) -! March 2023 -!..... - USE iso_fortran_env - IMPLICIT NONE - INTEGER, PARAMETER :: WP = real64 - -!..... -! 
Scalar arguments - CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF - INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & - NRNK, LDZ, LDB, LDW, LDS, & - LIWORK, LRWORK, LZWORK - INTEGER, INTENT(OUT) :: K, INFO - REAL(KIND=WP), INTENT(IN) :: TOL -! Array arguments - COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) - COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & - W(LDW,*), S(LDS,*) - COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) - COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) - REAL(KIND=WP), INTENT(OUT) :: RES(*) - REAL(KIND=WP), INTENT(OUT) :: RWORK(*) - INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! ZGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, ZGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, ZGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. +!> \brief \b ZGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== ! +! SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, EIGS, Z, LDZ, RES, B, LDB, & +! W, LDW, S, LDS, ZWORK, LZWORK, & +! RWORK, LRWORK, IWORK, LIWORK, INFO ) +!...... +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real64 +! +!...... +! Scalar arguments +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LIWORK, LRWORK, LZWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments +! COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) +! COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) +! REAL(KIND=WP), INTENT(OUT) :: RES(*) +! REAL(KIND=WP), INTENT(OUT) :: RWORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) +! +!............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> ZGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. 
For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, ZGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, ZGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim !...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim +!...................................................................... 
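In the notation of this header, the computation can be summarized by the standard DMD derivation that references [1]-[3] describe (a sketch of the mathematics, not text from the routine). With the thin SVD of the snapshot matrix and Y = A*X:

    X = U \Sigma V^*, \qquad U_K = U(:,1:K) \quad \text{(POD basis, returned in X)}
    S = U_K^* A\, U_K = U_K^*\, Y\, V_K\, \Sigma_K^{-1} \qquad \text{(Rayleigh quotient)}
    S\, w_i = \lambda_i w_i, \qquad z_i = U_K w_i, \qquad \mathrm{RES}(i) = \lVert A z_i - \lambda_i z_i \rVert_2

so EIGS returns the Ritz values \lambda_i, W the eigenvectors w_i, Z the Ritz vectors z_i, and S(1:K,1:K) the Rayleigh quotient, matching the argument descriptions below.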
+!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim !............................................................ ! Arguments ! ========= -! JOBS (input) CHARACTER*1 -! Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim !..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 'N' :: The eigenvectors are not computed. +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim !..... -! JOBR (input) CHARACTER*1 -! Determines whether to compute the residuals. -! 'R' :: The residuals for the computed eigenpairs will be -! computed and stored in the array RES. -! See the description of RES. -! For this option to be legal, JOBZ must be 'V'. -! 'N' :: The residuals are not computed. +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. 
+!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim !..... -! JOBF (input) CHARACTER*1 -! Specifies whether to store information needed for post- -! processing (e.g. computing refined Ritz vectors) -! 'R' :: The matrix needed for the refinement of the Ritz -! vectors is computed and stored in the array B. -! See the description of B. -! 'E' :: The unscaled eigenvectors of the Exact DMD are -! computed and returned in the array B. See the -! description of B. -! 'N' :: No eigenvector refinement data is computed. +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim !..... -! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } -! Allows for a selection of the SVD algorithm from the -! LAPACK library. -! 1 :: ZGESVD (the QR SVD algorithm) -! 2 :: ZGESDD (the Divide and Conquer algorithm; if enough -! workspace available, this is the fastest option) -! 3 :: ZGESVDQ (the preconditioned QR SVD ; this and 4 -! are the most accurate options) -! 4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 -! are the most accurate options) -! For the four methods above, a significant difference in -! the accuracy of small singular values is possible if -! the snapshots vary in norm so that X is severely -! ill-conditioned. If small (smaller than EPS*||X||) -! singular values are of interest and JOBS=='N', then -! the options (3, 4) give the most accurate results, where -! the option 4 is slightly better and with stronger -! theoretical background. -! If JOBS=='S', i.e. the columns of X will be normalized, -! then all methods give nearly equally accurate results. +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. +!> 1 :: ZGESVD (the QR SVD algorithm) +!> 2 :: ZGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: ZGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim !..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim !..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). 
+!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim !..... -! X (input/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. +!> \param[in] LDX +!> \verbatim +!> X (input/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. !..... -! LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim !..... -! Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim !..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. +!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim !..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1) -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. 
On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1) +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim !..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim !..... -! K (output) INTEGER, 0 <= K <= N -! The dimension of the POD basis for the data snapshot -! matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim !..... -! EIGS (output) COMPLEX(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of EIGS contain -! the computed eigenvalues (Ritz values). -! See the descriptions of K, and Z. +!> \param[out] EIGS +!> \verbatim +!> EIGS (output) COMPLEX(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of EIGS contain +!> the computed eigenvalues (Ritz values). +!> See the descriptions of K, and Z. +!> \endverbatim !..... -! Z (workspace/output) COMPLEX(KIND=WP) M-by-N array -! If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) -! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. -! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as -! the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) -! is an eigenvector corresponding to EIGS(i). The columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. -! See the descriptions of EIGS, X and W. +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) COMPLEX(KIND=WP) M-by-N array +!> If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) +!> is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +!> If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +!> the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) +!> is an eigenvector corresponding to EIGS(i). The columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. +!> See the descriptions of EIGS, X and W. +!> \endverbatim !..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim !..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs, -! 
RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. -! See the description of EIGS and Z. +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs, +!> RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. +!> See the description of EIGS and Z. +!> \endverbatim !..... -! B (output) COMPLEX(KIND=WP) M-by-N array. -! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1:K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. +!> \param[out] B +!> \verbatim +!> B (output) COMPLEX(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1:K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim !..... -! LDB (input) INTEGER, LDB >= M -! The leading dimension of the array B. +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim !..... -! W (workspace/output) COMPLEX(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient. -! The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! W is also used as a workspace to temporarily store the -! right singular vectors of X. +!> \param[out] W +!> \verbatim +!> W (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient. +!> The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> right singular vectors of X. +!> \endverbatim !..... -! LDW (input) INTEGER, LDW >= N -! The leading dimension of the array W. +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim !..... -! S (workspace/output) COMPLEX(KIND=WP) N-by-N array -! The array S(1:K,1:K) is used for the matrix Rayleigh -! quotient. This content is overwritten during -! the eigenvalue decomposition by ZGEEV. -! See the description of K. +!> \param[out] S +!> \verbatim +!> S (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by ZGEEV. +!> See the description of K. +!> \endverbatim !..... -! LDS (input) INTEGER, LDS >= N -! The leading dimension of the array S. +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim !..... -! ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array -! ZWORK is used as complex workspace in the complex SVD, as -! specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing -! the eigenvalues of a Rayleigh quotient. -! 
If the call to ZGEDMD is only workspace query, then -! ZWORK(1) contains the minimal complex workspace length and -! ZWORK(2) is the optimal complex workspace length. -! Hence, the length of work is at least 2. -! See the description of LZWORK. +!> \param[out] ZWORK +!> \verbatim +!> ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array +!> ZWORK is used as complex workspace in the complex SVD, as +!> specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing +!> the eigenvalues of a Rayleigh quotient. +!> If the call to ZGEDMD is only workspace query, then +!> ZWORK(1) contains the minimal complex workspace length and +!> ZWORK(2) is the optimal complex workspace length. +!> Hence, the length of work is at least 2. +!> See the description of LZWORK. +!> \endverbatim !..... -! LZWORK (input) INTEGER -! The minimal length of the workspace vector ZWORK. -! LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV), -! where LZWORK_ZGEEV = MAX( 1, 2*N ) and the minimal -! LZWORK_SVD is calculated as follows -! If WHTSVD == 1 :: ZGESVD :: -! LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) -! If WHTSVD == 2 :: ZGESDD :: -! LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) -! If WHTSVD == 3 :: ZGESVDQ :: -! LZWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: ZGEJSV :: -! LZWORK_SVD = obtainable by a query -! If on entry LZWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths and returns them in -! LZWORK(1) and LZWORK(2), respectively. +!> \param[in] LZWORK +!> \verbatim +!> LZWORK (input) INTEGER +!> The minimal length of the workspace vector ZWORK. +!> LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV), +!> where LZWORK_ZGEEV = MAX( 1, 2*N ) and the minimal +!> LZWORK_SVD is calculated as follows +!> If WHTSVD == 1 :: ZGESVD :: +!> LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) +!> If WHTSVD == 2 :: ZGESDD :: +!> LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) +!> If WHTSVD == 3 :: ZGESVDQ :: +!> LZWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: ZGEJSV :: +!> LZWORK_SVD = obtainable by a query +!> If on entry LZWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths and returns them in +!> LZWORK(1) and LZWORK(2), respectively. +!> \endverbatim !..... -! RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array -! On exit, RWORK(1:N) contains the singular values of -! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). -! If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain -! scaling factor RWORK(N+2)/RWORK(N+1) used to scale X -! and Y to avoid overflow in the SVD of X. -! This may be of interest if the scaling option is off -! and as many as possible smallest eigenvalues are -! desired to the highest feasible accuracy. -! If the call to ZGEDMD is only workspace query, then -! RWORK(1) contains the minimal workspace length. -! See the description of LRWORK. +!> \param[out] RWORK +!> \verbatim +!> RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array +!> On exit, RWORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain +!> scaling factor RWORK(N+2)/RWORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. 
+!> If the call to ZGEDMD is only workspace query, then +!> RWORK(1) contains the minimal workspace length. +!> See the description of LRWORK. +!> \endverbatim !..... -! LRWORK (input) INTEGER -! The minimal length of the workspace vector RWORK. -! LRWORK is calculated as follows: -! LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where -! LRWORK_ZGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace -! for the SVD subroutine determined by the input parameter -! WHTSVD. -! If WHTSVD == 1 :: ZGESVD :: -! LRWORK_SVD = 5*MIN(M,N) -! If WHTSVD == 2 :: ZGESDD :: -! LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), -! 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) -! If WHTSVD == 3 :: ZGESVDQ :: -! LRWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: ZGEJSV :: -! LRWORK_SVD = obtainable by a query -! If on entry LRWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! real workspace length and returns it in RWORK(1). +!> \param[in] LRWORK +!> \verbatim +!> LRWORK (input) INTEGER +!> The minimal length of the workspace vector RWORK. +!> LRWORK is calculated as follows: +!> LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where +!> LRWORK_ZGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace +!> for the SVD subroutine determined by the input parameter +!> WHTSVD. +!> If WHTSVD == 1 :: ZGESVD :: +!> LRWORK_SVD = 5*MIN(M,N) +!> If WHTSVD == 2 :: ZGESDD :: +!> LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), +!> 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) +!> If WHTSVD == 3 :: ZGESVDQ :: +!> LRWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: ZGEJSV :: +!> LRWORK_SVD = obtainable by a query +!> If on entry LRWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> real workspace length and returns it in RWORK(1). +!> \endverbatim !..... -! IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim !..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for ZWORK, RWORK and -! IWORK. See the descriptions of ZWORK, RWORK and IWORK. +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for ZWORK, RWORK and +!> IWORK. 
See the descriptions of ZWORK, RWORK and IWORK. +!> \endverbatim !..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! !............................................................. !............................................................. + SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, EIGS, Z, LDZ, RES, B, LDB, & + W, LDW, S, LDS, ZWORK, LZWORK, & + RWORK, LRWORK, IWORK, LIWORK, INFO ) +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +! +! Scalar arguments +! ~~~~~~~~~~~~~~~~ + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LIWORK, LRWORK, LZWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! +! Array arguments +! ~~~~~~~~~~~~~~~ + COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) + COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) + REAL(KIND=WP), INTENT(OUT) :: RES(*) + REAL(KIND=WP), INTENT(OUT) :: RWORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +! ! Parameters ! ~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) - +! ! Local scalars ! ~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -401,7 +551,7 @@ ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: RDUMMY(2) - +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) ZLANGE, DLAMCH, DZNRM2 @@ -409,13 +559,13 @@ INTEGER IZAMAX LOGICAL DISNAN, LSAME EXTERNAL DISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! 
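!
!     [Editorial note, not part of the patch] A minimal workspace-query
!     sketch for ZGEDMD, assembled from the argument descriptions above.
!     The query arrays ZQRY, RQRY, IQRY and the mode choices ('S', 'V',
!     'R', 'N', WHTSVD=1) are illustrative assumptions only. With
!     LZWORK = LRWORK = LIWORK = -1 the routine only computes the
!     required workspace lengths:
!
!        CALL ZGEDMD( 'S', 'V', 'R', 'N', 1, M, N, X, LDX, Y, LDY, &
!                     NRNK, TOL, K, EIGS, Z, LDZ, RES, B, LDB,     &
!                     W, LDW, S, LDS, ZQRY, -1, RQRY, -1,          &
!                     IQRY, -1, INFO )
!        LZWORK = INT( ZQRY(2) )   ! optimal; ZQRY(1) holds the minimum
!        LRWORK = INT( RQRY(1) )   ! minimal real workspace
!        LIWORK = IQRY(1)          ! minimal integer workspace
!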
~~~~~~~~~~~~~~~~~~~~ EXTERNAL ZAXPY, ZGEMM, ZDSCAL EXTERNAL ZGEEV, ZGEJSV, ZGESDD, ZGESVD, ZGESVDQ, & ZLACPY, ZLASCL, ZLASSQ, XERBLA - +! ! Intrinsic functions ! ~~~~~~~~~~~~~~~~~~~ INTRINSIC DBLE, INT, MAX, SQRT @@ -608,7 +758,8 @@ K = 0 DO i = 1, N !WORK(i) = DZNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL ZLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 @@ -681,7 +832,8 @@ ! carefully computed using ZLASSQ. DO i = 1, N !RWORK(i) = DZNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL ZLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 diff --git a/lapack-netlib/SRC/zgehrd.f b/lapack-netlib/SRC/zgehrd.f index e18493cf9..0f4424ded 100644 --- a/lapack-netlib/SRC/zgehrd.f +++ b/lapack-netlib/SRC/zgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gehrd * *> \par Further Details: * ===================== @@ -173,7 +173,7 @@ INTEGER IHI, ILO, INFO, LDA, LWORK, N * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) + COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) * .. * * ===================================================================== @@ -182,7 +182,7 @@ INTEGER NBMAX, LDT, TSIZE PARAMETER ( NBMAX = 64, LDT = NBMAX+1, $ TSIZE = LDT*NBMAX ) - COMPLEX*16 ZERO, ONE + COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) * .. @@ -190,7 +190,7 @@ LOGICAL LQUERY INTEGER I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB, $ NBMIN, NH, NX - COMPLEX*16 EI + COMPLEX*16 EI * .. * .. External Subroutines .. EXTERNAL ZAXPY, ZGEHD2, ZGEMM, ZLAHR2, ZLARFB, ZTRMM, @@ -221,12 +221,18 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'ZGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'ZGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + END IF WORK( 1 ) = LWKOPT ENDIF * @@ -248,7 +254,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -268,7 +273,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of diff --git a/lapack-netlib/SRC/zgelq.f b/lapack-netlib/SRC/zgelq.f index de7c9a378..86610e801 100644 --- a/lapack-netlib/SRC/zgelq.f +++ b/lapack-netlib/SRC/zgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -166,6 +166,8 @@ *> the LQ factorization. 
*> \endverbatim *> +*> \ingroup gelq +*> * ===================================================================== SUBROUTINE ZGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) diff --git a/lapack-netlib/SRC/zgelqf.f b/lapack-netlib/SRC/zgelqf.f index 6c295eece..e988ea818 100644 --- a/lapack-netlib/SRC/zgelqf.f +++ b/lapack-netlib/SRC/zgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -118,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gelqf * *> \par Further Details: * ===================== @@ -174,9 +175,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'ZGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -184,19 +184,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = LWKOPT RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN diff --git a/lapack-netlib/SRC/zgemlq.f b/lapack-netlib/SRC/zgemlq.f index 41cd1c059..11489087a 100644 --- a/lapack-netlib/SRC/zgemlq.f +++ b/lapack-netlib/SRC/zgemlq.f @@ -109,16 +109,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -142,7 +143,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -158,11 +159,13 @@ *> block sizes MB and NB returned by ILAENV, ZGELQ will use either *> ZLASWLQ (if the matrix is wide-and-short) or ZGELQT to compute *> the LQ factorization. -*> This version of ZGEMLQ will use either ZLAMSWLQ or ZGEMLQT to +*> This version of ZGEMLQ will use either ZLAMSWLQ or ZGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in ZLAMSWLQ or ZGEMLQT. *> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE ZGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -184,7 +187,7 @@ * .. * .. Local Scalars .. 
LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -200,7 +203,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -215,6 +218,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -243,7 +253,7 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( ( LWORK.LT.LWMIN ) .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 END IF * @@ -260,7 +270,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/zgemqr.f b/lapack-netlib/SRC/zgemqr.f index c83eaff2f..d14d74fe2 100644 --- a/lapack-netlib/SRC/zgemqr.f +++ b/lapack-netlib/SRC/zgemqr.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -166,6 +167,8 @@ *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE ZGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -187,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -203,7 +206,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -218,6 +221,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -246,12 +256,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
.NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN @@ -263,7 +273,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -276,7 +286,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = LWMIN * RETURN * diff --git a/lapack-netlib/SRC/zgeqlf.f b/lapack-netlib/SRC/zgeqlf.f index 94721540c..a27612c64 100644 --- a/lapack-netlib/SRC/zgeqlf.f +++ b/lapack-netlib/SRC/zgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -113,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup geqlf * *> \par Further Details: * ===================== @@ -188,8 +189,9 @@ END IF WORK( 1 ) = LWKOPT * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * diff --git a/lapack-netlib/SRC/zgeqp3rk.c b/lapack-netlib/SRC/zgeqp3rk.c new file mode 100644 index 000000000..0c8b41c2d --- /dev/null +++ b/lapack-netlib/SRC/zgeqp3rk.c @@ -0,0 +1,1074 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + 
ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + +/* Check if the matrix A contains +Inf or -Inf, set INFO parameter */ +/* to the column number, where the first +/-Inf is found plus N, */ +/* and continue the computation. */ + + *info = *n + kp1; + + } + +/* ================================================================== */ + +/* Quick return if possible for the case when the first */ +/* stopping criterion is satisfied, i.e. KMAX = 0. */ + + if (*kmax == 0) { + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.; + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + z__1.r = (doublereal) lwkopt, z__1.i = 0.; + work[1].r = z__1.r, work[1].i = z__1.i; + return 0; + } + +/* ================================================================== */ + + eps = dlamch_("Epsilon"); + +/* Adjust ABSTOL */ + + if (*abstol >= 0.) { + safmin = dlamch_("Safe minimum"); +/* Computing MAX */ + d__1 = *abstol, d__2 = safmin * 2.; + *abstol = f2cmax(d__1,d__2); + } + +/* Adjust RELTOL */ + + if (*reltol >= 0.) { + *reltol = f2cmax(*reltol,eps); + } + +/* =================================================================== */ + +/* JMAX is the maximum index of the column to be factorized, */ +/* which is also limited by the first stopping criterion KMAX. */ + + jmax = f2cmin(*kmax,minmn); + +/* =================================================================== */ + +/* Quick return if possible for the case when the second or third */ +/* stopping criterion for the whole original matrix is satified, */ +/* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */ +/* (which is ONE <= RELTOL). */ + + if (maxc2nrm <= *abstol || 1. <= *reltol) { + + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.; + + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + + z__1.r = (doublereal) lwkopt, z__1.i = 0.; + work[1].r = z__1.r, work[1].i = z__1.i; + return 0; + } + +/* ================================================================== */ +/* Factorize columns */ +/* ================================================================== */ + +/* Determine the block size. */ + + nbmin = 2; + nx = 0; + + if (nb > 1 && nb < minmn) { + +/* Determine when to cross over from blocked to unblocked code. */ +/* (for N less than NX, unblocked code should be used). 
*/
+
+/* Computing MAX */
+	i__1 = 0, i__2 = ilaenv_(&c__3, "ZGEQP3RK", " ", m, n, &c_n1, &c_n1, (
+		ftnlen)8, (ftnlen)1);
+	nx = f2cmax(i__1,i__2);
+
+	if (nx < minmn) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+	    if (*lwork < lwkopt) {
+
+/* Not enough workspace to use optimal block size that */
+/* is currently stored in NB. */
+/* Reduce NB and determine the minimum value of NB. */
+
+		nb = (*lwork - (*n << 1)) / (*n + 1);
+/* Computing MAX */
+		i__1 = 2, i__2 = ilaenv_(&c__2, "ZGEQP3RK", " ", m, n, &c_n1,
+			&c_n1, (ftnlen)8, (ftnlen)1);
+		nbmin = f2cmax(i__1,i__2);
+
+	    }
+	}
+    }
+
+/* ================================================================== */
+
+/* DONE is the boolean flag to represent the case when the */
+/* factorization completed in the block factorization routine, */
+/* before the end of the block. */
+
+    done = FALSE_;
+
+/* J is the column index. */
+
+    j = 1;
+
+/* (1) Use blocked code initially. */
+
+/* JMAXB is the maximum column index of the block, when the */
+/* blocked code is used, and is also limited by the first stopping */
+/* criterion KMAX. */
+
+/* Computing MIN */
+    i__1 = *kmax, i__2 = minmn - nx;
+    jmaxb = f2cmin(i__1,i__2);
+
+    if (nb >= nbmin && nb < jmax && jmaxb > 0) {
+
+/* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */
+/* J is the column index of a column block; */
+/* JB is the column block size to pass to block factorization */
+/* routine in a loop step; */
+/* JBF is the number of columns that were actually factorized, */
+/* as returned by the block factorization routine */
+/* in a loop step, JBF <= JB; */
+/* N_SUB is the number of columns in the submatrix; */
+/* IOFFSET is the number of rows that should not be factorized. */
+
+	while(j <= jmaxb) {
+
+/* Computing MIN */
+	    i__1 = nb, i__2 = jmaxb - j + 1;
+	    jb = f2cmin(i__1,i__2);
+	    n_sub__ = *n - j + 1;
+	    ioffset = j - 1;
+
+/* Factorize JB columns among the columns A(J:N). */
+
+	    i__1 = *n + *nrhs - j + 1;
+	    zlaqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1,
+		    &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf,
+		    maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], &
+		    rwork[*n + j], &work[1], &work[jb + 1], &i__1, &iwork[1],
+		    &iinfo);
+
+/* Set INFO on the first occurrence of Inf. */
+
+	    if (iinfo > n_sub__ && *info == 0) {
+		*info = (ioffset << 1) + iinfo;
+	    }
+
+	    if (done) {
+
+/* Either the submatrix is zero before the end of the */
+/* column block, or the ABSTOL or RELTOL criterion is */
+/* satisfied before the end of the column block; then we can */
+/* return from the routine. Perform the following before */
+/* returning: */
+/* a) Set the number of factorized columns K, */
+/* K = IOFFSET + JBF from the last call of blocked */
+/* routine. */
+/* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */
+/* by the block factorization routine; */
+/* 2) The remaining TAUs are set to ZERO by the */
+/* block factorization routine. */
+
+		*k = ioffset + jbf;
+
+/* Set INFO on the first occurrence of NaN, NaN takes */
+/* precedence over Inf. */
+
+		if (iinfo <= n_sub__ && iinfo > 0) {
+		    *info = ioffset + iinfo;
+		}
+
+/* Return from the routine. */
+
+		z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+		work[1].r = z__1.r, work[1].i = z__1.i;
+
+		return 0;
+
+	    }
+
+	    j += jbf;
+
+	}
+
+    }
+
+/* Use unblocked code to factor the last or only block. */
+/* J = JMAX+1 means we factorized the maximum possible number of */
+/* columns, that is, in the ELSE clause we need to compute */
+/* the MAXC2NORM and RELMAXC2NORM to return after we processed */
+/* the blocks. */
+
+    if (j <= jmax) {
+
+/* N_SUB is the number of columns in the submatrix; */
+/* IOFFSET is the number of rows that should not be factorized. */
+
+	n_sub__ = *n - j + 1;
+	ioffset = j - 1;
+
+	i__1 = jmax - j + 1;
+	zlaqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, &
+		maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk,
+		relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], &rwork[*n + j], &
+		work[1], &iinfo);
+
+/* The ABSTOL or RELTOL criterion is satisfied when the number of */
+/* the factorized columns KF is smaller than the number */
+/* of columns JMAX-J+1 supplied to be factorized by the */
+/* unblocked routine; then we can return from */
+/* the routine. Perform the following before returning: */
+/* a) Set the number of factorized columns K, */
+/* b) MAXC2NRMK and RELMAXC2NRMK are returned by the */
+/* unblocked factorization routine above. */
+
+	*k = j - 1 + kf;
+
+/* Set INFO on the first exception occurrence. */
+
+/* Set INFO on the first exception occurrence of Inf or NaN, */
+/* (NaN takes precedence over Inf). */
+
+	if (iinfo > n_sub__ && *info == 0) {
+	    *info = (ioffset << 1) + iinfo;
+	} else if (iinfo <= n_sub__ && iinfo > 0) {
+	    *info = ioffset + iinfo;
+	}
+
+    } else {
+
+/* Compute the return values for blocked code. */
+
+/* Set the number of factorized columns if the unblocked routine */
+/* was not called. */
+
+	*k = jmax;
+
+/* If there exists a residual matrix after the blocked code: */
+/* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */
+/* residual matrix, otherwise set them to ZERO; */
+/* 2) Set TAU(K+1:MINMN) to ZERO. */
+
+	if (*k < minmn) {
+	    i__1 = *n - *k;
+	    jmaxc2nrm = *k + idamax_(&i__1, &rwork[*k + 1], &c__1);
+	    *maxc2nrmk = rwork[jmaxc2nrm];
+	    if (*k == 0) {
+		*relmaxc2nrmk = 1.;
+	    } else {
+		*relmaxc2nrmk = *maxc2nrmk / maxc2nrm;
+	    }
+
+	    i__1 = minmn;
+	    for (j = *k + 1; j <= i__1; ++j) {
+		i__2 = j;
+		tau[i__2].r = 0., tau[i__2].i = 0.;
+	    }
+
+	} else {
+	    *maxc2nrmk = 0.;
+	    *relmaxc2nrmk = 0.;
+
+	}
+
+/* END IF( J.LE.JMAX ) THEN */
+
+    }
+
+    z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+    work[1].r = z__1.r, work[1].i = z__1.i;
+
+    return 0;
+
+/* End of ZGEQP3RK */
+
+} /* zgeqp3rk_ */
+
diff --git a/lapack-netlib/SRC/zgeqp3rk.f b/lapack-netlib/SRC/zgeqp3rk.f
new file mode 100644
index 000000000..01dcce0de
--- /dev/null
+++ b/lapack-netlib/SRC/zgeqp3rk.f
@@ -0,0 +1,1092 @@
+*> \brief \b ZGEQP3RK computes a truncated Householder QR factorization with column pivoting of a complex m-by-n matrix A by using Level 3 BLAS and overwrites m-by-nrhs matrix B with Q**H * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZGEQP3RK + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+*      $                     K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*      $                     WORK, LWORK, RWORK, IWORK, INFO )
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       INTEGER            INFO, K, KMAX, LDA, LWORK, M, N, NRHS
+*       DOUBLE PRECISION   ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IWORK( * ), JPIV( * )
+*       DOUBLE PRECISION   RWORK( * )
+*       COMPLEX*16         A( LDA, * ), TAU( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZGEQP3RK performs two tasks simultaneously:
+*>
+*> Task 1: The routine computes a truncated (rank K) or full rank
+*>   Householder QR factorization with column pivoting of a complex
+*>   M-by-N matrix A using Level 3 BLAS. K is the number of columns
+*>   that were factorized, i.e. the factorization rank of the
+*>   factor R, K <= min(M,N).
+*>
+*>   A * P(K) = Q(K) * R(K) =
+*>
+*>     = Q(K) * ( R11(K)  R12(K) ) = Q(K) * (  R(K)_approx    )
+*>              (   0     R22(K) )          ( 0 R(K)_residual ),
+*>
+*>   where:
+*>
+*>   P(K) is an N-by-N permutation matrix;
+*>   Q(K) is an M-by-M unitary matrix;
+*>   R(K)_approx = ( R11(K), R12(K) ) is a rank K approximation of the
+*>                 full rank factor R with K-by-K upper-triangular
+*>                 R11(K) and K-by-(N-K) rectangular R12(K). The diagonal
+*>                 entries of R11(K) appear in non-increasing order
+*>                 of absolute value, and absolute values of all of
+*>                 them exceed the maximum column 2-norm of R22(K)
+*>                 up to roundoff error.
+*>   R(K)_residual = R22(K) is the residual of a rank K approximation
+*>                 of the full rank factor R. It is
+*>                 an (M-K)-by-(N-K) rectangular matrix;
+*>   0 is an (M-K)-by-K zero matrix.
+*>
+*> Task 2: At the same time, the routine overwrites a complex M-by-NRHS
+*>   matrix B with Q(K)**H * B using Level 3 BLAS.
+*>
+*> =====================================================================
+*>
+*> The matrices A and B are stored on input in the array A as
+*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS)
+*> respectively.
+*>
+*>                  N     NRHS
+*>   array_A = M [ mat_A, mat_B ]
+*>
+*> The truncation criteria (i.e. when to stop the factorization)
+*> can be any of the following:
+*>
+*>   1) The input parameter KMAX, the maximum number of columns
+*>      KMAX to factorize, i.e. the factorization rank is limited
+*>      to KMAX. If KMAX >= min(M,N), the criterion is not used.
+*>
+*>   2) The input parameter ABSTOL, the absolute tolerance for
+*>      the maximum column 2-norm of the residual matrix R22(K). This
+*>      means that the factorization stops if this norm is less than
+*>      or equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used.
+*>
+*>   3) The input parameter RELTOL, the tolerance for the maximum
+*>      column 2-norm of the residual matrix R22(K) divided
+*>      by the maximum column 2-norm of the original matrix A, which
+*>      is equal to abs(R(1,1)). This means that the factorization stops
+*>      when the ratio of the maximum column 2-norm of R22(K) to
+*>      the maximum column 2-norm of A is less than or equal to RELTOL.
+*>      If RELTOL < 0.0, the criterion is not used.
+*>
+*>   4) In case both stopping criteria ABSTOL and RELTOL are not used,
+*>      and when the residual matrix R22(K) is a zero matrix in some
+*>      factorization step K. ( This stopping criterion is implicit. )
+*>
+*>   The algorithm stops when any of these conditions is first
+*>   satisfied, otherwise the whole matrix A is factorized.
+*>
+*>   To factorize the whole matrix A, use the values
+*>   KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0.
+*>
+*> The routine returns:
+*>  a) Q(K), R(K)_approx = ( R11(K), R12(K) ),
+*>     R(K)_residual = R22(K), P(K), i.e. the resulting matrices
+*>     of the factorization; P(K) is represented by JPIV,
+*>     ( if K = min(M,N), R(K)_approx is the full factor R,
+*>     and there is no residual matrix R(K)_residual);
+*>  b) K, the number of columns that were factorized,
+*>     i.e.
factorization rank; +*> c) MAXC2NRMK, the maximum column 2-norm of the residual +*> matrix R(K)_residual = R22(K), +*> ( if K = min(M,N), MAXC2NRMK = 0.0 ); +*> d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum +*> column 2-norm of the original matrix A, which is equal +*> to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 ); +*> e) Q(K)**H * B, the matrix B with the unitary +*> transformation Q(K)**H applied on the left. +*> +*> The N-by-N permutation matrix P(K) is stored in a compact form in +*> the integer array JPIV. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The M-by-M unitary matrix Q is represented as a product +*> of elementary Householder reflectors +*> +*> Q(K) = H(1) * H(2) * . . . * H(K), +*> +*> where K is the number of columns that were factorized. +*> +*> Each H(j) has the form +*> +*> H(j) = I - tau * v * v**H, +*> +*> where 1 <= j <= K and +*> I is an M-by-M identity matrix, +*> tau is a complex scalar, +*> v is a complex vector with v(1:j-1) = 0 and v(j) = 1. +*> +*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j). +*> +*> See the Further Details section for more information. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e. the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M,N), then this stopping criterion +*> is not used, the routine factorizes columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B are not modified, and +*> the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION +*> +*> The second factorization stopping criterion, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix R22(K). +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix R22(K) +*> is less than or equal to ABSTOL. Let SAFMIN = DLAMCH('S'). +*> +*> a) If ABSTOL is NaN, then no computation is performed +*> and an error message ( INFO = -5 ) is issued +*> by XERBLA. +*> +*> b) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN +*> is used. This includes the case ABSTOL = -0.0. +*> +*> d) If 2*SAFMIN <= ABSTOL then the input value +*> of ABSTOL is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If ABSTOL chosen above is >= MAXC2NRM, then this +*> stopping criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. The routine +*> returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. 
+*> This includes the case ABSTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION +*> +*> The third factorization stopping criterion, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio +*> abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of +*> the residual matrix R22(K) to the maximum column 2-norm of +*> the original matrix A. The algorithm converges (stops the +*> factorization), when abs(R(K+1,K+1))/abs(R(1,1)) A is less +*> than or equal to RELTOL. Let EPS = DLAMCH('E'). +*> +*> a) If RELTOL is NaN, then no computation is performed +*> and an error message ( INFO = -6 ) is issued +*> by XERBLA. +*> +*> b) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used. +*> This includes the case RELTOL = -0.0. +*> +*> d) If EPS <= RELTOL then the input value of RELTOL +*> is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If RELTOL chosen above is >= 1.0, then this stopping +*> criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. +*> The routine returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case RELTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> +*> NOTE: We recommend that RELTOL satisfy +*> min( 10*max(M,N)*EPS, sqrt(EPS) ) <= RELTOL +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N+NRHS) +*> +*> On entry: +*> +*> a) The subarray A(1:M,1:N) contains the M-by-N matrix A. +*> b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS +*> matrix B. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> +*> a) The subarray A(1:M,1:N) contains parts of the factors +*> of the matrix A: +*> +*> 1) If K = 0, A(1:M,1:N) contains the original matrix A. +*> 2) If K > 0, A(1:M,1:N) contains parts of the +*> factors: +*> +*> 1. The elements below the diagonal of the subarray +*> A(1:M,1:K) together with TAU(1:K) represent the +*> unitary matrix Q(K) as a product of K Householder +*> elementary reflectors. +*> +*> 2. The elements on and above the diagonal of +*> the subarray A(1:K,1:N) contain K-by-N +*> upper-trapezoidal matrix +*> R(K)_approx = ( R11(K), R12(K) ). +*> NOTE: If K=min(M,N), i.e. full rank factorization, +*> then R_approx(K) is the full factor R which +*> is upper-trapezoidal. If, in addition, M>=N, +*> then R is upper-triangular. +*> +*> 3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K) +*> rectangular matrix R(K)_residual = R22(K). +*> +*> b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains +*> the M-by-NRHS product Q(K)**H * B. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> This is the leading dimension for both matrices, A and B. +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M,KMAX,N). 
+*> +*> K also represents the number of non-zero Householder +*> vectors. +*> +*> NOTE: If K = 0, a) the arrays A and B are not modified; +*> b) the array TAU(1:min(M,N)) is set to ZERO, +*> if the matrix A does not contain NaN, +*> otherwise the elements TAU(1:min(M,N)) +*> are undefined; +*> c) the elements of the array JPIV are set +*> as follows: for j = 1:N, JPIV(j) = j. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix R22(K), +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then MAXC2NRMK equals the maximum column 2-norm +*> of the original matrix A. +*> +*> b) If 0 < K < min(M,N), then MAXC2NRMK is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then MAXC2NRMK = 0.0. +*> +*> NOTE: MAXC2NRMK in the factorization step K would equal +*> R(K+1,K+1) in the next factorization step K+1. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix R22(K) (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then RELMAXC2NRMK = 1.0. +*> +*> b) If 0 < K < min(M,N), then +*> RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then RELMAXC2NRMK = 0.0. +*> +*> NOTE: RELMAXC2NRMK in the factorization step K would equal +*> abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization +*> step K+1. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The elements of the array JPIV(1:N) are always set +*> by the routine, for example, even when no columns +*> were factorized, i.e. when K = 0, the elements are +*> set as JPIV(j) = j for j = 1:N. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX*16 array, dimension (min(M,N)) +*> The scalar factors of the elementary reflectors. +*> +*> If 0 < K <= min(M,N), only the elements TAU(1:K) of +*> the array TAU are modified by the factorization. +*> After the factorization computed, if no NaN was found +*> during the factorization, the remaining elements +*> TAU(K+1:min(M,N)) are set to zero, otherwise the +*> elements TAU(K+1:min(M,N)) are not set and therefore +*> undefined. +*> ( If K = 0, all elements of TAU are set to zero, if +*> the matrix A does not contain NaN. ) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= N+NRHS-1, otherwise. +*> For optimal performance LWORK >= NB*( N+NRHS+1 ), +*> where NB is the optimal block size for ZGEQP3RK returned +*> by ILAENV. Minimal block size MINNB=2. 
+*>
+*>    NOTE: The decision whether to use unblocked BLAS 2
+*>    or blocked BLAS 3 code is based not only on the dimension
+*>    LWORK of the available workspace WORK, but also on the
+*>    matrix A dimension N via the crossover point NX returned
+*>    by ILAENV. (For N less than NX, unblocked code should be
+*>    used.)
+*>
+*>    If LWORK = -1, then a workspace query is assumed;
+*>    the routine only calculates the optimal size of the WORK
+*>    array, returns this value as the first entry of the WORK
+*>    array, and no error message related to LWORK is issued
+*>    by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>    RWORK is DOUBLE PRECISION array, dimension (2*N)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>    IWORK is INTEGER array, dimension (N-1).
+*>    Is a work array. ( IWORK is used to store indices
+*>    of "bad" columns for norm downdating in the residual
+*>    matrix in the blocked step auxiliary subroutine ZLAQP3RK ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>    INFO is INTEGER
+*>    1) INFO = 0: successful exit.
+*>    2) INFO < 0: if INFO = -i, the i-th argument had an
+*>       illegal value.
+*>    3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>       detected and the routine stops the computation.
+*>       The j_1-th column of the matrix A or the j_1-th
+*>       element of array TAU contains the first occurrence
+*>       of NaN in the factorization step K+1 ( when K columns
+*>       have been factorized ).
+*>
+*>       On exit:
+*>       K                  is set to the number of
+*>                          factorized columns without
+*>                          exception.
+*>       MAXC2NRMK          is set to NaN.
+*>       RELMAXC2NRMK       is set to NaN.
+*>       TAU(K+1:min(M,N))  is not set and contains undefined
+*>                          elements. If j_1=K+1, TAU(K+1)
+*>                          may contain NaN.
+*>    4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>       was detected, but +Inf (or -Inf) was detected and
+*>       the routine continues the computation until completion.
+*>       The (j_2-N)-th column of the matrix A contains the first
+*>       occurrence of +Inf (or -Inf) in the factorization
+*>       step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup geqp3rk
+*
+*> \par Further Details:
+*  =====================
+*
+*> \verbatim
+*> ZGEQP3RK is based on the same BLAS3 Householder QR factorization
+*> algorithm with column pivoting as the ZGEQP3 routine, which uses
+*> the ZLARFG routine to generate Householder reflectors
+*> for QR factorization.
+*>
+*> We can also write:
+*>
+*>   A = A_approx(K) + A_residual(K)
+*>
+*> The low rank approximation matrix A(K)_approx from
+*> the truncated QR factorization of rank K of the matrix A is:
+*>
+*>   A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T
+*>                        (    0    0   )
+*>
+*>               = Q(K) * ( R11(K) R12(K) ) * P(K)**T
+*>                        (   0      0    )
+*>
+*> The residual A_residual(K) of the matrix A is:
+*>
+*>   A_residual(K) = Q(K) * ( 0        0       ) * P(K)**T =
+*>                          ( 0  R(K)_residual )
+*>
+*>                 = Q(K) * ( 0    0    ) * P(K)**T
+*>                          ( 0  R22(K) )
+*>
+*> The truncated (rank K) factorization guarantees that
+*> the maximum column 2-norm of A_residual(K) is less than
+*> or equal to MAXC2NRMK up to roundoff error.
+*>
+*> NOTE: An approximation of the null vectors
+*>       of A can be easily computed from R11(K)
+*>       and R12(K):
+*>
+*>       Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) )
+*>                                 (         -I           )
+*>
+*> \endverbatim
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*>     G.
Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, + $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ WORK, LWORK, RWORK, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + DOUBLE PRECISION RWORK( * ) + COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER INB, INBMIN, IXOVER + PARAMETER ( INB = 1, INBMIN = 2, IXOVER = 3 ) + DOUBLE PRECISION ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 ) + COMPLEX*16 CZERO + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, DONE + INTEGER IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX, + $ JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB, + $ NBMIN, NX + DOUBLE PRECISION EPS, HUGEVAL, MAXC2NRM, SAFMIN +* .. +* .. External Subroutines .. + EXTERNAL ZLAQP2RK, ZLAQP3RK, XERBLA +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX, ILAENV + DOUBLE PRECISION DLAMCH, DZNRM2 + EXTERNAL DISNAN, DLAMCH, DZNRM2, IDAMAX, ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test input arguments +* ==================== +* + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( KMAX.LT.0 ) THEN + INFO = -4 + ELSE IF( DISNAN( ABSTOL ) ) THEN + INFO = -5 + ELSE IF( DISNAN( RELTOL ) ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -8 + END IF +* +* If the input parameters M, N, NRHS, KMAX, LDA are valid: +* a) Test the input workspace size LWORK for the minimum +* size requirement IWS. +* b) Determine the optimal block size NB and optimal +* workspace size LWKOPT to be returned in WORK(1) +* in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., +* (3) when routine exits. 
+*     Here, IWS is the minimum workspace required for unblocked
+*     code.
+*
+      IF( INFO.EQ.0 ) THEN
+         MINMN = MIN( M, N )
+         IF( MINMN.EQ.0 ) THEN
+            IWS = 1
+            LWKOPT = 1
+         ELSE
+*
+*           Minimal workspace size in case of using only unblocked
+*           BLAS 2 code in ZLAQP2RK.
+*           1) ZLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in ZLARF subroutine inside ZLAQP2RK to apply an
+*              elementary reflector from the left.
+*           TOTAL_WORK_SIZE = 3*N + NRHS - 1
+*
+            IWS = N + NRHS - 1
+*
+*           Assign to NB the optimal block size.
+*
+            NB = ILAENV( INB, 'ZGEQP3RK', ' ', M, N, -1, -1 )
+*
+*           A formula for the optimal workspace size in case of using
+*           both unblocked BLAS 2 in ZLAQP2RK and blocked BLAS 3 code
+*           in ZLAQP3RK.
+*           1) ZGEQP3RK, ZLAQP2RK, ZLAQP3RK: 2*N to store full and
+*              partial column 2-norms.
+*           2) ZLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in ZLARF subroutine to apply an elementary reflector
+*              from the left.
+*           3) ZLAQP3RK: NB*(N+NRHS) to use in the work array F that
+*              is used to apply a block reflector from
+*              the left.
+*           4) ZLAQP3RK: NB to use in the auxiliary array AUX.
+*           Sizes (2) and ((3) + (4)) should intersect, therefore
+*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2.
+*           ( For example, N = 100, NRHS = 1, NB = 32 gives
+*             LWKOPT = 2*100 + 32*(100+1+1) = 3464. )
+*
+            LWKOPT = 2*N + NB*( N+NRHS+1 )
+         END IF
+         WORK( 1 ) = DCMPLX( LWKOPT )
+*
+         IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+*     NOTE: The optimal workspace size is returned in WORK(1), if
+*     the input parameters M, N, NRHS, KMAX, LDA are valid.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZGEQP3RK', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible for M=0 or N=0.
+*
+      IF( MINMN.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+*     Initialize column pivot array JPIV.
+*
+      DO J = 1, N
+         JPIV( J ) = J
+      END DO
+*
+*     ==================================================================
+*
+*     Initialize storage for partial and exact column 2-norms.
+*     a) The elements RWORK(1:N) are used to store partial column
+*        2-norms of the matrix A, and may decrease in each computation
+*        step; initialize them to the complete column 2-norms.
+*     b) The elements RWORK(N+1:2*N) are used to store complete column
+*        2-norms of the matrix A; they are not changed during the
+*        computation. Initialize them to the complete column 2-norms.
+*
+      DO J = 1, N
+         RWORK( J ) = DZNRM2( M, A( 1, J ), 1 )
+         RWORK( N+J ) = RWORK( J )
+      END DO
+*
+*     ==================================================================
+*
+*     Compute the pivot column index and the maximum column 2-norm
+*     for the whole original matrix stored in A(1:M,1:N).
+*
+      KP1 = IDAMAX( N, RWORK( 1 ), 1 )
+      MAXC2NRM = RWORK( KP1 )
+*
+*     ==================================================================
+*
+      IF( DISNAN( MAXC2NRM ) ) THEN
+*
+*        Check if the matrix A contains NaN, set INFO parameter
+*        to the column number where the first NaN is found and return
+*        from the routine.
+*
+         K = 0
+         INFO = KP1
+*
+*        Set MAXC2NRMK and RELMAXC2NRMK to NaN.
+*
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = MAXC2NRM
+*
+*        Array TAU is not set and contains undefined elements.
+*
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ===================================================================
+*
+      IF( MAXC2NRM.EQ.ZERO ) THEN
+*
+*        Check if the matrix A is a zero matrix; set array TAU and
+*        return from the routine.
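+*        ( In this case the computed rank is K = 0, both norm
+*          outputs MAXC2NRMK and RELMAXC2NRMK are set to ZERO, and
+*          all MINMN elements of TAU are set to CZERO below. )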
+*
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+*
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+*
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+*
+      END IF
+*
+*     ===================================================================
+*
+      HUGEVAL = DLAMCH( 'Overflow' )
+*
+      IF( MAXC2NRM.GT.HUGEVAL ) THEN
+*
+*        Check if the matrix A contains +Inf or -Inf, set INFO parameter
+*        to the column number, where the first +/-Inf is found plus N,
+*        and continue the computation.
+*
+         INFO = N + KP1
+*
+      END IF
+*
+*     ==================================================================
+*
+*     Quick return if possible for the case when the first
+*     stopping criterion is satisfied, i.e. KMAX = 0.
+*
+      IF( KMAX.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+      EPS = DLAMCH('Epsilon')
+*
+*     Adjust ABSTOL
+*
+      IF( ABSTOL.GE.ZERO ) THEN
+         SAFMIN = DLAMCH('Safe minimum')
+         ABSTOL = MAX( ABSTOL, TWO*SAFMIN )
+      END IF
+*
+*     Adjust RELTOL
+*
+      IF( RELTOL.GE.ZERO ) THEN
+         RELTOL = MAX( RELTOL, EPS )
+      END IF
+*
+*     ===================================================================
+*
+*     JMAX is the maximum index of the column to be factorized,
+*     which is also limited by the first stopping criterion KMAX.
+*
+      JMAX = MIN( KMAX, MINMN )
+*
+*     ===================================================================
+*
+*     Quick return if possible for the case when the second or third
+*     stopping criterion for the whole original matrix is satisfied,
+*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL
+*     (which is ONE <= RELTOL).
+*
+      IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN
+*
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+*
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+*
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*     Factorize columns
+*     ==================================================================
+*
+*     Determine the block size.
+*
+      NBMIN = 2
+      NX = 0
+*
+      IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN
+*
+*        Determine when to cross over from blocked to unblocked code.
+*        (For N less than NX, unblocked code should be used.)
+*
+         NX = MAX( 0, ILAENV( IXOVER, 'ZGEQP3RK', ' ', M, N, -1, -1 ) )
+*
+         IF( NX.LT.MINMN ) THEN
+*
+*           Determine if workspace is large enough for blocked code.
+*
+            IF( LWORK.LT.LWKOPT ) THEN
+*
+*              Not enough workspace to use the optimal block size that
+*              is currently stored in NB.
+*              Reduce NB and determine the minimum value of NB.
+*
+               NB = ( LWORK-2*N ) / ( N+1 )
+               NBMIN = MAX( 2, ILAENV( INBMIN, 'ZGEQP3RK', ' ', M, N,
+     $                 -1, -1 ) )
+*
+            END IF
+         END IF
+      END IF
+*
+*     ==================================================================
+*
+*     DONE is the boolean flag to represent the case when the
+*     factorization completed in the block factorization routine,
+*     before the end of the block.
+*
+      DONE = .FALSE.
+*
+*     J is the column index.
+*
+      J = 1
+*
+*     (1) Use blocked code initially.
+*
+*     JMAXB is the maximum column index of the block when the
+*     blocked code is used; it is also limited by the first stopping
+*     criterion KMAX.
+*
+      JMAXB = MIN( KMAX, MINMN - NX )
+*
+      IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN
+*
+*        Loop over the column blocks of the matrix A(1:M,1:JMAXB).
+*        Here:
+*        J is the column index of a column block;
+*        JB is the column block size to pass to the block factorization
+*        routine in a loop step;
+*        JBF is the number of columns that were actually factorized,
+*        as returned by the block factorization routine
+*        in a loop step, JBF <= JB;
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*        ( For example, with NB = 4 and JMAXB = 10 the loop attempts
+*          blocks of 4, 4 and 2 columns, advancing J by the returned
+*          JBF after each call, assuming each call factorizes its
+*          full block. )
+*
+         DO WHILE( J.LE.JMAXB )
+*
+            JB = MIN( NB, JMAXB-J+1 )
+            N_SUB = N-J+1
+            IOFFSET = J-1
+*
+*           Factorize JB columns among the columns A(J:N).
+*
+            CALL ZLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                     DONE, JBF, MAXC2NRMK, RELMAXC2NRMK,
+     $                     JPIV( J ), TAU( J ),
+     $                     RWORK( J ), RWORK( N+J ),
+     $                     WORK( 1 ), WORK( JB+1 ),
+     $                     N+NRHS-J+1, IWORK, IINFO )
+*
+*           Set INFO on the first occurrence of Inf.
+*
+            IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+               INFO = 2*IOFFSET + IINFO
+            END IF
+*
+            IF( DONE ) THEN
+*
+*              Either the submatrix is zero before the end of the
+*              column block, or the ABSTOL or RELTOL criterion is
+*              satisfied before the end of the column block; we can
+*              return from the routine. Perform the following before
+*              returning:
+*                 a) Set the number of factorized columns K,
+*                    K = IOFFSET + JBF from the last call of the
+*                    blocked routine.
+*                 NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned
+*                          by the block factorization routine;
+*                       2) The remaining TAUs are set to ZERO by the
+*                          block factorization routine.
+*
+               K = IOFFSET + JBF
+*
+*              Set INFO on the first occurrence of NaN; NaN takes
+*              precedence over Inf.
+*
+               IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+                  INFO = IOFFSET + IINFO
+               END IF
+*
+*              Return from the routine.
+*
+               WORK( 1 ) = DCMPLX( LWKOPT )
+*
+               RETURN
+*
+            END IF
+*
+            J = J + JBF
+*
+         END DO
+*
+      END IF
+*
+*     Use unblocked code to factor the last or only block.
+*     J = JMAX+1 means we factorized the maximum possible number of
+*     columns; that is, in the ELSE clause we need to compute
+*     the MAXC2NORM and RELMAXC2NORM to return after we processed
+*     the blocks.
+*
+      IF( J.LE.JMAX ) THEN
+*
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         N_SUB = N-J+1
+         IOFFSET = J-1
+*
+         CALL ZLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1,
+     $                  ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                  KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ),
+     $                  TAU( J ), RWORK( J ), RWORK( N+J ),
+     $                  WORK( 1 ), IINFO )
+*
+*        The ABSTOL or RELTOL criterion is satisfied when the number
+*        of factorized columns KF is smaller than the number
+*        of columns JMAX-J+1 supplied to be factorized by the
+*        unblocked routine; we can return from
+*        the routine. Perform the following before returning:
+*           a) Set the number of factorized columns K,
+*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the
+*              unblocked factorization routine above.
+*
+         K = J - 1 + KF
+*
+*        Set INFO on the first occurrence of an exception, Inf or NaN
+*        ( NaN takes precedence over Inf ).
+*
+         IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+            INFO = 2*IOFFSET + IINFO
+         ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+            INFO = IOFFSET + IINFO
+         END IF
+*
+      ELSE
+*
+*        Compute the return values for blocked code.
+*
+*        Set the number of factorized columns if the unblocked routine
+*        was not called.
+*
+         K = JMAX
+*
+*        If there exists a residual matrix after the blocked code:
+*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the
+*              residual matrix, otherwise set them to ZERO;
+*           2) Set TAU(K+1:MINMN) to ZERO.
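+*        ( NOTE: MAXC2NRMK is taken below from the partial column
+*          2-norms kept in RWORK(1:N), which the factorization
+*          routines keep updated for the columns that remain
+*          unfactorized. )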
+* + IF( K.LT.MINMN ) THEN + JMAXC2NRM = K + IDAMAX( N-K, RWORK( K+1 ), 1 ) + MAXC2NRMK = RWORK( JMAXC2NRM ) + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + DO J = K + 1, MINMN + TAU( J ) = CZERO + END DO +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + END IF +* +* END IF( J.LE.JMAX ) THEN +* + END IF +* + WORK( 1 ) = DCMPLX( LWKOPT ) +* + RETURN +* +* End of ZGEQP3RK +* + END diff --git a/lapack-netlib/SRC/zgeqr.f b/lapack-netlib/SRC/zgeqr.f index 20a80d083..7df9c2403 100644 --- a/lapack-netlib/SRC/zgeqr.f +++ b/lapack-netlib/SRC/zgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE ZGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -244,8 +246,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +257,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +272,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +286,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +313,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ * RETURN * diff --git a/lapack-netlib/SRC/zgeqrfp.f b/lapack-netlib/SRC/zgeqrfp.f index 73bcde667..3562de36e 100644 --- a/lapack-netlib/SRC/zgeqrfp.f +++ b/lapack-netlib/SRC/zgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -122,7 +123,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup geqrfp * *> \par Further Details: * ===================== @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. 
      EXTERNAL           XERBLA, ZGEQR2P, ZLARFB, ZLARFT
@@ -181,8 +182,16 @@
*
      INFO = 0
      NB = ILAENV( 1, 'ZGEQRF', ' ', M, N, -1, -1 )
-      LWKOPT = N*NB
+      K = MIN( M, N )
+      IF( K.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = N
+         LWKOPT = N*NB
+      END IF
      WORK( 1 ) = LWKOPT
+*
      LQUERY = ( LWORK.EQ.-1 )
      IF( M.LT.0 ) THEN
         INFO = -1
@@ -190,7 +199,7 @@
         INFO = -2
      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
         INFO = -7
      END IF
      IF( INFO.NE.0 ) THEN
@@ -202,7 +211,6 @@
*
*     Quick return if possible
*
-      K = MIN( M, N )
      IF( K.EQ.0 ) THEN
         WORK( 1 ) = 1
         RETURN
@@ -210,7 +218,7 @@
*
      NBMIN = 2
      NX = 0
-      IWS = N
+      IWS = LWKMIN
      IF( NB.GT.1 .AND. NB.LT.K ) THEN
*
*        Determine when to cross over from blocked to unblocked code.
diff --git a/lapack-netlib/SRC/zgesvj.f b/lapack-netlib/SRC/zgesvj.f
index 6cd2335f2..2be45d826 100644
--- a/lapack-netlib/SRC/zgesvj.f
+++ b/lapack-netlib/SRC/zgesvj.f
@@ -200,23 +200,25 @@
*> \verbatim
*>          LDV is INTEGER
*>          The leading dimension of the array V, LDV >= 1.
-*>          If JOBV = 'V', then LDV >= max(1,N).
-*>          If JOBV = 'A', then LDV >= max(1,MV) .
+*>          If JOBV = 'V', then LDV >= MAX(1,N).
+*>          If JOBV = 'A', then LDV >= MAX(1,MV).
*> \endverbatim
*>
*> \param[in,out] CWORK
*> \verbatim
-*>          CWORK is COMPLEX*16 array, dimension (max(1,LWORK))
+*>          CWORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
*>          Used as workspace.
-*>          If on entry LWORK = -1, then a workspace query is assumed and
-*>          no computation is done; CWORK(1) is set to the minial (and optimal)
-*>          length of CWORK.
*> \endverbatim
*>
*> \param[in] LWORK
*> \verbatim
*>          LWORK is INTEGER.
-*>          Length of CWORK, LWORK >= M+N.
+*>          Length of CWORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M+N, otherwise.
+*>
+*>          If on entry LWORK = -1, then a workspace query is assumed and
+*>          no computation is done; CWORK(1) is set to the minimal (and optimal)
+*>          length of CWORK.
*> \endverbatim
*>
*> \param[in,out] RWORK
@@ -247,15 +249,17 @@
*>          RWORK(6) = the largest absolute value over all sines of the
*>                     Jacobi rotation angles in the last sweep. It can be
*>                     useful for a post festum analysis.
-*>          If on entry LRWORK = -1, then a workspace query is assumed and
-*>          no computation is done; RWORK(1) is set to the minial (and optimal)
-*>          length of RWORK.
*> \endverbatim
*>
*> \param[in] LRWORK
*> \verbatim
*>          LRWORK is INTEGER
-*>          Length of RWORK, LRWORK >= MAX(6,N).
+*>          Length of RWORK.
+*>          LRWORK >= 1, if MIN(M,N) = 0, and LRWORK >= MAX(6,N), otherwise.
+*>
+*>          If on entry LRWORK = -1, then a workspace query is assumed and
+*>          no computation is done; RWORK(1) is set to the minimal (and optimal)
+*>          length of RWORK.
*> \endverbatim
*>
*> \param[out] INFO
@@ -276,7 +280,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
-*> \ingroup complex16GEcomputational
+*> \ingroup gesvj
*
*> \par Further Details:
*  =====================
@@ -367,23 +371,25 @@
*
*     .. Local Parameters ..
      DOUBLE PRECISION ZERO, HALF, ONE
-      PARAMETER  ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0)
-      COMPLEX*16   CZERO, CONE
-      PARAMETER  ( CZERO = (0.0D0, 0.0D0), CONE = (1.0D0, 0.0D0) )
-      INTEGER      NSWEEP
-      PARAMETER  ( NSWEEP = 30 )
+      PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0)
+      COMPLEX*16 CZERO, CONE
+      PARAMETER ( CZERO = (0.0D0, 0.0D0), CONE = (1.0D0, 0.0D0) )
+      INTEGER NSWEEP
+      PARAMETER ( NSWEEP = 30 )
*     ..
*     .. Local Scalars ..
- COMPLEX*16 AAPQ, OMPQ - DOUBLE PRECISION AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, - $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, - $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, - $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL - INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, - $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, - $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + COMPLEX*16 AAPQ, OMPQ + DOUBLE PRECISION AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, + $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, + $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, + $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, + $ TOL + INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, + $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, + $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, + $ SWBAND, MINMN, LWMIN, LRWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, + $ ROTOK, RSVEC, UCTOL, UPPER * .. * .. * .. Intrinsic Functions .. @@ -422,7 +428,16 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * - LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 ) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + LRWMIN = 1 + ELSE + LWMIN = M+N + LRWMIN = MAX( 6, N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -442,9 +457,9 @@ INFO = -11 ELSE IF( UCTOL .AND. ( RWORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( ( LWORK.LT.( M+N ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 - ELSE IF( ( LRWORK.LT.MAX( N, 6 ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LRWORK.LT.LRWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -15 ELSE INFO = 0 @@ -454,15 +469,15 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGESVJ', -INFO ) RETURN - ELSE IF ( LQUERY ) THEN - CWORK(1) = M + N - RWORK(1) = MAX( N, 6 ) + ELSE IF( LQUERY ) THEN + CWORK( 1 ) = LWMIN + RWORK( 1 ) = LRWMIN RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/zgetri.f b/lapack-netlib/SRC/zgetri.f index 41782841c..f3806a77c 100644 --- a/lapack-netlib/SRC/zgetri.f +++ b/lapack-netlib/SRC/zgetri.f @@ -107,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup getri * * ===================================================================== SUBROUTINE ZGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -152,7 +152,7 @@ * INFO = 0 NB = ILAENV( 1, 'ZGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zgetsls.f b/lapack-netlib/SRC/zgetsls.f index 17c6d5146..26311c611 100644 --- a/lapack-netlib/SRC/zgetsls.f +++ b/lapack-netlib/SRC/zgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \ingroup complex16GEsolve +*> \ingroup getsls * * ===================================================================== SUBROUTINE ZGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, @@ -192,7 +192,7 @@ * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH, ZLANGE - EXTERNAL LSAME, DLABAD, DLAMCH, ZLANGE + EXTERNAL LSAME, DLAMCH, ZLANGE * .. * .. External Subroutines .. EXTERNAL ZGEQR, ZGEMQR, ZLASCL, ZLASET, @@ -229,7 +229,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEO = 1 + WSIZEM = 1 + ELSE IF( M.GE.N ) THEN CALL ZGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) @@ -297,7 +300,6 @@ * SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * diff --git a/lapack-netlib/SRC/zgetsqrhrt.f b/lapack-netlib/SRC/zgetsqrhrt.f index 5f0167937..e7ce993aa 100644 --- a/lapack-netlib/SRC/zgetsqrhrt.f +++ b/lapack-netlib/SRC/zgetsqrhrt.f @@ -131,13 +131,15 @@ *> \param[in] LWORK *> \verbatim *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -160,7 +162,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup comlpex16OTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -212,7 +214,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -225,7 +227,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -263,8 +265,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN INFO = -11 END IF * diff --git a/lapack-netlib/SRC/zgges3.f b/lapack-netlib/SRC/zgges3.f index 8b3e44f88..8235c2543 100644 --- a/lapack-netlib/SRC/zgges3.f +++ b/lapack-netlib/SRC/zgges3.f @@ -215,7 +215,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N) +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -260,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \ingroup complex16GEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE ZGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -300,7 +301,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT + $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT, + $ LWKMIN DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SMLNUM * .. @@ -309,9 +311,8 @@ DOUBLE PRECISION DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL DLABAD, XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, - $ ZLAQZ0, ZLACPY, ZLASCL, ZLASET, ZTGSEN, ZUNGQR, - $ ZUNMQR + EXTERNAL XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, ZLAQZ0, + $ ZLACPY, ZLASCL, ZLASET, ZTGSEN, ZUNGQR, ZUNMQR * .. * .. External Functions .. LOGICAL LSAME @@ -353,6 +354,8 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -369,7 +372,7 @@ INFO = -14 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -16 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * @@ -377,28 +380,32 @@ * IF( INFO.EQ.0 ) THEN CALL ZGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N + INT( WORK( 1 ) ) ) CALL ZUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) IF( ILVSL ) THEN CALL ZUNGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, IERR ) LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) END IF CALL ZGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL, $ LDVSL, VSR, LDVSR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) CALL ZLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, WORK, -1, $ RWORK, 0, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) IF( WANTST ) THEN CALL ZTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, SDIM, $ PVSL, PVSR, DIF, WORK, -1, IDUM, 1, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = DCMPLX( LWKOPT ) END IF - WORK( 1 ) = DCMPLX( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -420,7 +427,6 @@ EPS = DLAMCH( 'P' ) SMLNUM = DLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/zggev3.f b/lapack-netlib/SRC/zggev3.f index 2d6c74582..0cc073470 100644 --- a/lapack-netlib/SRC/zggev3.f +++ b/lapack-netlib/SRC/zggev3.f @@ -174,7 +174,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -208,7 +209,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \ingroup complex16GEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE ZGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA, @@ -243,7 +244,7 @@ CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, $ IN, IRIGHT, IROWS, IRWRK, ITAU, IWRK, JC, JR, - $ LWKOPT + $ LWKMIN, LWKOPT DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP COMPLEX*16 X @@ -252,9 +253,8 @@ LOGICAL LDUMMA( 1 ) * .. * .. External Subroutines .. - EXTERNAL DLABAD, XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, - $ ZLAQZ0, ZLACPY, ZLASCL, ZLASET, ZTGEVC, ZUNGQR, - $ ZUNMQR + EXTERNAL XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, ZLAQZ0, + $ ZLACPY, ZLASCL, ZLASET, ZTGEVC, ZUNGQR, ZUNMQR * .. * .. External Functions .. LOGICAL LSAME @@ -301,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -315,7 +316,7 @@ INFO = -11 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -15 END IF * @@ -323,7 +324,7 @@ * IF( INFO.EQ.0 ) THEN CALL ZGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N+INT( WORK( 1 ) ) ) CALL ZUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) @@ -348,7 +349,11 @@ $ RWORK, 0, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) END IF - WORK( 1 ) = DCMPLX( LWKOPT ) + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = DCMPLX( LWKOPT ) + END IF END IF * IF( INFO.NE.0 ) THEN @@ -368,7 +373,6 @@ EPS = DLAMCH( 'E' )*DLAMCH( 'B' ) SMLNUM = DLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/zgghd3.f b/lapack-netlib/SRC/zgghd3.f index b29cdc70a..08343688d 100644 --- a/lapack-netlib/SRC/zgghd3.f +++ b/lapack-netlib/SRC/zgghd3.f @@ -176,14 +176,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -208,7 +208,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -275,7 +275,12 @@ * INFO = 0 NB = ILAENV( 1, 'ZGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF WORK( 1 ) = DCMPLX( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) @@ -325,7 +330,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = CONE RETURN @@ -883,6 +887,7 @@ IF ( JCOL.LT.IHI ) $ CALL ZGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) +* WORK( 1 ) = DCMPLX( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/zggqrf.f b/lapack-netlib/SRC/zggqrf.f index 0388b0874..d8636d663 100644 --- a/lapack-netlib/SRC/zggqrf.f +++ b/lapack-netlib/SRC/zggqrf.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \ingroup complex16OTHERcomputational +*> \ingroup ggqrf * *> \par Further Details: * ===================== @@ -250,7 +250,7 @@ NB2 = ILAENV( 1, 'ZGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'ZUNMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zggrqf.f b/lapack-netlib/SRC/zggrqf.f index be912c772..69c14af24 100644 --- a/lapack-netlib/SRC/zggrqf.f +++ b/lapack-netlib/SRC/zggrqf.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup ggrqf * *> \par Further Details: * ===================== @@ -249,7 +249,7 @@ NB2 = ILAENV( 1, 'ZGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'ZUNMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zggsvd3.f b/lapack-netlib/SRC/zggsvd3.f index 71257a7c0..40624f5be 100644 --- a/lapack-netlib/SRC/zggsvd3.f +++ b/lapack-netlib/SRC/zggsvd3.f @@ -277,7 +277,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -332,7 +332,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/zggsvp3.f b/lapack-netlib/SRC/zggsvp3.f index f39ccdad3..7b465aaee 100644 --- a/lapack-netlib/SRC/zggsvp3.f +++ b/lapack-netlib/SRC/zggsvp3.f @@ -233,7 +233,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -256,7 +256,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/zheevd.f b/lapack-netlib/SRC/zheevd.f index ba52f9e72..8e86b9e88 100644 --- a/lapack-netlib/SRC/zheevd.f +++ b/lapack-netlib/SRC/zheevd.f @@ -116,8 +116,7 @@ *> *> \param[out] RWORK *> \verbatim -*> RWORK is DOUBLE PRECISION array, -*> dimension (LRWORK) +*> RWORK is DOUBLE PRECISION array, dimension (MAX(1,LRWORK)) *> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. *> \endverbatim *> @@ -180,7 +179,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEeigen +*> \ingroup heevd * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/zheevr.f b/lapack-netlib/SRC/zheevr.f index 1452e04a3..fe6e1a85f 100644 --- a/lapack-netlib/SRC/zheevr.f +++ b/lapack-netlib/SRC/zheevr.f @@ -272,7 +272,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,2*N). +*> The length of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 2*N. *> For optimal efficiency, LWORK >= (NB+1)*N, *> where NB is the max of the blocksize for ZHETRD and for *> ZUNMTR as returned by ILAENV. @@ -294,7 +295,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). 
+*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -313,7 +315,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -338,7 +341,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEeigen +*> \ingroup heevr * *> \par Contributors: * ================== @@ -417,9 +420,15 @@ LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR. $ ( LIWORK.EQ.-1 ) ) * - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) - LWMIN = MAX( 1, 2*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 2*N + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -454,7 +463,7 @@ NB = ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'ZUNMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKOPT RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * @@ -483,7 +492,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = DBLE( A( 1, 1 ) ) @@ -710,7 +719,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKOPT RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * diff --git a/lapack-netlib/SRC/zheevr_2stage.f b/lapack-netlib/SRC/zheevr_2stage.f index 5c576e633..b1cc7175f 100644 --- a/lapack-netlib/SRC/zheevr_2stage.f +++ b/lapack-netlib/SRC/zheevr_2stage.f @@ -265,7 +265,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of ZSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the unitary transformations applied by ZUNMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -279,12 +279,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -310,7 +311,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -329,7 +331,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. 
*> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -354,7 +357,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -382,7 +385,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -390,11 +393,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -472,9 +475,16 @@ IB = ILAENV2STAGE( 2, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = N + LHTRD + LWTRD - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = N + LHTRD + LWTRD + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -535,7 +545,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = DBLE( A( 1, 1 ) ) @@ -643,9 +653,9 @@ * * Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. * - CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), + CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), $ RWORK( INDRE ), WORK( INDTAU ), - $ WORK( INDHOUS ), LHTRD, + $ WORK( INDHOUS ), LHTRD, $ WORK( INDWK ), LLWORK, IINFO ) * * If all eigenvalues are desired diff --git a/lapack-netlib/SRC/zhesv_aa.f b/lapack-netlib/SRC/zhesv_aa.f index df8498c7a..b3d4b3725 100644 --- a/lapack-netlib/SRC/zhesv_aa.f +++ b/lapack-netlib/SRC/zhesv_aa.f @@ -128,7 +128,7 @@ *> LWORK is INTEGER *> The length of WORK. LWORK >= MAX(1,2*N,3*N-2), and for best *> performance LWORK >= max(1,N*NB), where NB is the optimal -*> blocksize for ZHETRF. +*> blocksize for ZHETRF_AA. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEsolve +*> \ingroup hesv_aa * * ===================================================================== SUBROUTINE ZHESV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -177,7 +177,7 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS + INTEGER LWKMIN, LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS * .. * .. External Functions .. LOGICAL LSAME @@ -196,6 +196,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. 
.NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -206,17 +207,17 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_HETRF = INT( WORK(1) ) + LWKOPT_HETRF = INT( WORK( 1 ) ) CALL ZHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_HETRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS ) + LWKOPT_HETRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_HETRF, LWKOPT_HETRS ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/zhesv_aa_2stage.f b/lapack-netlib/SRC/zhesv_aa_2stage.f index 79c01c546..c503b5554 100644 --- a/lapack-netlib/SRC/zhesv_aa_2stage.f +++ b/lapack-netlib/SRC/zhesv_aa_2stage.f @@ -100,14 +100,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX*16 array, dimension (LTB) +*> TB is COMPLEX*16 array, dimension (MAX(1,LTB)). *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -147,14 +147,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 workspace of size LWORK +*> WORK is COMPLEX*16 workspace of size (MAX(1,LWORK)). +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -178,7 +179,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEsolve +*> \ingroup hesv_aa_2stage * * ===================================================================== SUBROUTINE ZHESV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, @@ -208,7 +209,7 @@ * * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKOPT, LWKMIN * .. * .. External Functions .. LOGICAL LSAME @@ -229,6 +230,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -237,18 +239,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zhesvx.f b/lapack-netlib/SRC/zhesvx.f index 485c81df6..64aa16674 100644 --- a/lapack-netlib/SRC/zhesvx.f +++ b/lapack-netlib/SRC/zhesvx.f @@ -234,8 +234,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. 
LWORK >= max(1,2*N), and for best -*> performance, when FACT = 'N', LWORK >= max(1,2*N,N*NB), where +*> The length of WORK. LWORK >= MAX(1,2*N), and for best +*> performance, when FACT = 'N', LWORK >= MAX(1,2*N,N*NB), where *> NB is the optimal blocksize for ZHETRF. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -276,7 +276,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEsolve +*> \ingroup hesvx * * ===================================================================== SUBROUTINE ZHESVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B, @@ -307,7 +307,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKOPT, LWKMIN, NB DOUBLE PRECISION ANORM * .. * .. External Functions .. @@ -329,6 +329,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -346,12 +347,12 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 2*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) diff --git a/lapack-netlib/SRC/zhetrd_2stage.f b/lapack-netlib/SRC/zhetrd_2stage.f index b9d2f0eb1..ab444894b 100644 --- a/lapack-netlib/SRC/zhetrd_2stage.f +++ b/lapack-netlib/SRC/zhetrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download ZHETRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download ZHETRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * COMPLEX*16 A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. -* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. 
*> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the unitary *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is COMPLEX*16 array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is COMPLEX*16 array, dimension (LHOUS2) +*> HOUS2 is COMPLEX*16 array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -143,23 +145,26 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +182,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +207,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +215,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. 
*> International Journal of High Performance Computing Applications.
*> Volume 28 Issue 2, Pages 196-209, May 2014.
-*> http://hpc.sagepub.com/content/28/2/196 
+*> http://hpc.sagepub.com/content/28/2/196
*>
*> \endverbatim
*>
* =====================================================================
-      SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+      SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
     $                          HOUS2, LHOUS2, WORK, LWORK, INFO )
*
      IMPLICIT NONE
@@ -265,10 +270,13 @@
*
      KD     = ILAENV2STAGE( 1, 'ZHETRD_2STAGE', VECT, N, -1, -1, -1 )
      IB     = ILAENV2STAGE( 2, 'ZHETRD_2STAGE', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
-*      WRITE(*,*),'ZHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO,
-*     $            LHMIN, LWMIN
+      IF( N.EQ.0 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
+      END IF
*
      IF( .NOT.LSAME( VECT, 'N' ) ) THEN
         INFO = -1
@@ -309,14 +317,14 @@
      LWRK  = LWORK-LDAB*N
      ABPOS = 1
      WPOS  = ABPOS + LDAB*N
-      CALL ZHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, 
+      CALL ZHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB,
     $                   TAU, WORK( WPOS ), LWRK, INFO )
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'ZHETRD_HE2HB', -INFO )
         RETURN
      END IF
-      CALL ZHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, 
-     $                   WORK( ABPOS ), LDAB, D, E, 
+      CALL ZHETRD_HB2ST( 'Y', VECT, UPLO, N, KD,
+     $                   WORK( ABPOS ), LDAB, D, E,
     $                   HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO )
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'ZHETRD_HB2ST', -INFO )
@@ -324,7 +332,6 @@
      END IF
*
*
-      HOUS2( 1 ) = LHMIN
      WORK( 1 )  = LWMIN
      RETURN
*
diff --git a/lapack-netlib/SRC/zhetrd_hb2st.F b/lapack-netlib/SRC/zhetrd_hb2st.F
index 1d39ac942..247497ab6 100644
--- a/lapack-netlib/SRC/zhetrd_hb2st.F
+++ b/lapack-netlib/SRC/zhetrd_hb2st.F
@@ -18,7 +18,7 @@
*  Definition:
*  ===========
*
-*       SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+*       SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
*                                D, E, HOUS, LHOUS, WORK, LWORK, INFO )
*
* #if defined(_OPENMP)
@@ -53,12 +53,12 @@
*> \param[in] STAGE1
*> \verbatim
*>          STAGE1 is CHARACTER*1
-*>          = 'N':  "No": to mention that the stage 1 of the reduction  
+*>          = 'N':  "No": to mention that the stage 1 of the reduction
*>                  from dense to band using the zhetrd_he2hb routine
-*>                  was not called before this routine to reproduce AB. 
-*>                  In other term this routine is called as standalone.  
-*>          = 'Y':  "Yes": to mention that the stage 1 of the 
-*>                  reduction from dense to band using the zhetrd_he2hb 
+*>                  was not called before this routine to reproduce AB.
+*>                  In other words, this routine is called standalone.
+*>          = 'Y':  "Yes": to mention that the stage 1 of the
+*>                  reduction from dense to band using the zhetrd_he2hb
*>                  routine has been called to produce AB (e.g., AB is
*>                  the output of zhetrd_he2hb).
*> \endverbatim
*>
*> \param[in] VECT
*> \verbatim
*>          VECT is CHARACTER*1
-*>          = 'N':  No need for the Housholder representation, 
+*>          = 'N':  No need for the Householder representation,
*>                  and thus LHOUS is of size max(1, 4*N);
-*>          = 'V':  the Householder representation is needed to 
-*>                  either generate or to apply Q later on, 
+*>          = 'V':  the Householder representation is needed to
+*>                  either generate or to apply Q later on,
*>                  then LHOUS is to be queried and computed.
*>                  (NOT AVAILABLE IN THIS RELEASE).
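+*>                  ( A query with LHOUS = -1 returns the required
+*>                    size in HOUS(1); see LHOUS below. )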
*> \endverbatim @@ -132,34 +132,39 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is COMPLEX*16 array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is COMPLEX*16 array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error *> message related to LHOUS is issued by XERBLA. *> LHOUS = MAX(1, dimension) where *> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> not available now if VECT='H' *> \endverbatim *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension LWORK. +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)). +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -188,7 +193,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup hetrd_hb2st * *> \par Further Details: * ===================== @@ -208,7 +213,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -216,16 +221,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * @@ -259,11 +264,11 @@ * .. * .. Local Scalars .. 
      LOGICAL            LQUERY, WANTQ, UPPER, AFTERS1
-      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, 
+      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
     $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
     $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
-     $                   NBTILES, TTYPE, TID, NTHREADS, DEBUG,
-     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, 
+     $                   NBTILES, TTYPE, TID, NTHREADS,
+     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
     $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
     $                   SIZEV, SIZETAU, LDV, LHMIN, LWMIN
      DOUBLE PRECISION   ABSTMP
@@ -277,7 +282,7 @@
*     ..
*     .. External Functions ..
      LOGICAL            LSAME
-      INTEGER            ILAENV2STAGE 
+      INTEGER            ILAENV2STAGE
      EXTERNAL           LSAME, ILAENV2STAGE
*     ..
*     .. Executable Statements ..
@@ -285,7 +290,6 @@
*     Determine the minimal workspace size required.
*     Test the input parameters
*
-      DEBUG = 0
      INFO = 0
      AFTERS1 = LSAME( STAGE1, 'Y' )
      WANTQ   = LSAME( VECT, 'V' )
@@ -294,9 +298,14 @@
*
*     Determine the block size, the workspace size and the hous size.
*
-      IB     = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
+      IB     = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 )
+      IF( N.EQ.0 .OR. KD.LE.1 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
+      END IF
*
      IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
         INFO = -1
@@ -358,7 +367,7 @@
         ABDPOS   = KD + 1
         ABOFDPOS = KD
      ELSE
-         APOS     = INDA 
+         APOS     = INDA
         AWPOS    = INDA + KD + 1
         DPOS     = APOS
         OFDPOS   = DPOS + 1
@@ -366,11 +375,11 @@
         ABOFDPOS = 2
      ENDIF
-* 
-*     Case KD=0: 
-*     The matrix is diagonal. We just copy it (convert to "real" for 
-*     complex because D is double and the imaginary part should be 0) 
-*     and store it in D. A sequential code here is better or 
+*
+*     Case KD=0:
+*     The matrix is diagonal. We just copy it (convert to "real" for
+*     complex because D is double and the imaginary part should be 0)
+*     and store it in D. A sequential code here is better or
*     in a parallel environment it might need two cores for D and E
*
      IF( KD.EQ.0 ) THEN
@@ -385,17 +394,17 @@
         WORK( 1 ) = 1
         RETURN
      END IF
-* 
-*     Case KD=1: 
-*     The matrix is already Tridiagonal. We have to make diagonal 
+*
+*     Case KD=1:
+*     The matrix is already tridiagonal. We have to make diagonal
*     and offdiagonal elements real, and store them in D and E.
-*     For that, for real precision just copy the diag and offdiag 
-*     to D and E while for the COMPLEX case the bulge chasing is 
-*     performed to convert the hermetian tridiagonal to symmetric 
-*     tridiagonal. A simpler conversion formula might be used, but then 
+*     For that, for real precision just copy the diag and offdiag
+*     to D and E while for the COMPLEX case the bulge chasing is
+*     performed to convert the hermitian tridiagonal to symmetric
+*     tridiagonal. A simpler conversion formula might be used, but then
*     updating the Q matrix will be required and based if Q is generated
-*     or not this might complicate the story. 
-* 
+*     or not this might complicate the story.
+*
      IF( KD.EQ.1 ) THEN
         DO 50 I = 1, N
            D( I ) = DBLE( AB( ABDPOS, I ) )
@@ -444,7 +453,7 @@ C     END IF
         RETURN
      END IF
*
-*     Main code start here. 
+*     Main code start here.
*     Reduce the hermitian band of A to a tridiagonal matrix.
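[Editor's sketch] The !$OMP TASK DEPEND clauses in the hunks just below order the bulge-chasing kernels through dummy in/out dependencies on elements of WORK instead of barriers. A tiny self-contained illustration of that pattern follows; it is not OpenBLAS code, just the same dependency idiom (requires an OpenMP 4.0+ compiler):

      PROGRAM DEPEND_CHAIN
      IMPLICIT NONE
      INTEGER :: SLOT( 0:3 ), I
      SLOT = 0
!$OMP PARALLEL
!$OMP MASTER
      DO I = 1, 3
!        A task may start only after its predecessor wrote SLOT(I-1),
!        mirroring DEPEND(in:WORK(MYID-1)) / DEPEND(out:WORK(MYID)).
!$OMP TASK DEPEND(in:SLOT(I-1)) DEPEND(out:SLOT(I)) FIRSTPRIVATE(I)
         SLOT( I ) = SLOT( I-1 ) + 1
!$OMP END TASK
      END DO
!$OMP END MASTER
!$OMP END PARALLEL
!     Prints 1 2 3 regardless of the number of threads.
      PRINT *, SLOT( 1:3 )
      END PROGRAM DEPEND_CHAIN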
* THGRSIZ = N @@ -453,7 +462,7 @@ C END IF NBTILES = CEILING( REAL(N)/REAL(KD) ) STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) -* +* CALL ZLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) CALL ZLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) * @@ -462,7 +471,7 @@ C END IF * #if defined(_OPENMP) !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) -!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) !$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) !$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) !$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) @@ -471,7 +480,7 @@ C END IF #endif * * main bulge chasing loop -* +* DO 100 THGRID = 1, THGRNB STT = (THGRID-1)*THGRSIZ+1 THED = MIN( (STT + THGRSIZ -1), (N-1)) @@ -482,7 +491,7 @@ C END IF ST = STT DO 130 SWEEPID = ST, ED DO 140 K = 1, GRSIZ - MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) $ + (M-1)*GRSIZ + K IF ( MYID.EQ.1 ) THEN TTYPE = 1 @@ -508,17 +517,17 @@ C END IF ENDIF * * Call the kernel -* +* #if defined(_OPENMP) && _OPENMP >= 201307 - IF( TTYPE.NE.1 ) THEN + IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK @@ -526,20 +535,20 @@ C END IF !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK ENDIF #else - CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW ) ) -#endif +#endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 EXIT @@ -554,14 +563,14 @@ C END IF !$OMP END MASTER !$OMP END PARALLEL #endif -* +* * Copy the diagonal from A to D. Note that D is REAL thus only * the Real part is needed, the imaginary part should be zero. * DO 150 I = 1, N D( I ) = DBLE( WORK( DPOS+(I-1)*LDA ) ) 150 CONTINUE -* +* * Copy the off diagonal from A to E. Note that E is REAL thus only * the Real part is needed, the imaginary part should be zero. * @@ -575,11 +584,10 @@ C END IF 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN WORK( 1 ) = LWMIN RETURN * * End of ZHETRD_HB2ST * END - + diff --git a/lapack-netlib/SRC/zhetrd_he2hb.f b/lapack-netlib/SRC/zhetrd_he2hb.f index e1b2e1794..3e3bfa374 100644 --- a/lapack-netlib/SRC/zhetrd_he2hb.f +++ b/lapack-netlib/SRC/zhetrd_he2hb.f @@ -123,8 +123,8 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY). 
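[Editor's note] For orientation, a compact restatement (not text from the patch) of the two-stage reduction that ZHETRD_HE2HB and ZHETRD_HB2ST implement:

\[
A = Q_1 B Q_1^H, \qquad B = Q_2 T Q_2^H
\quad\Longrightarrow\quad
A = (Q_1 Q_2)\, T \,(Q_1 Q_2)^H ,
\]

where A is the dense Hermitian input, B is Hermitian band with bandwidth KD (stage 1), and T is the real symmetric tridiagonal matrix returned in D and E (stage 2). When N <= KD+1 the input already fits inside the band, which is why the hunk above lets stage 1 accept LWORK = 1.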
+*>
*>          If LWORK = -1, then a workspace query is assumed; the routine
*>          only calculates the optimal size of the WORK array, returns
*>          this value as the first entry of the WORK array, and no error
@@ -158,7 +160,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrd_he2hb
*
*> \par Further Details:
*  =====================
@@ -293,8 +295,12 @@
      INFO   = 0
      UPPER  = LSAME( UPLO, 'U' )
      LQUERY = ( LWORK.EQ.-1 )
-      LWMIN  = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 )
-
+      IF( N.LE.KD+1 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 )
+      END IF
+*
      IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
         INFO = -1
      ELSE IF( N.LT.0 ) THEN
         INFO = -2
diff --git a/lapack-netlib/SRC/zhetrf.f b/lapack-netlib/SRC/zhetrf.f
index 78d4f71b8..433887108 100644
--- a/lapack-netlib/SRC/zhetrf.f
+++ b/lapack-netlib/SRC/zhetrf.f
@@ -107,7 +107,7 @@
*> \param[in] LWORK
*> \verbatim
*>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
*>          LWORK >= N*NB, where NB is the block size returned by ILAENV.
*> \endverbatim
*>
@@ -130,7 +130,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrf
*
*> \par Further Details:
*  =====================
@@ -227,7 +227,7 @@
*        Determine the block size
*
         NB = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
         WORK( 1 ) = LWKOPT
      END IF
*
@@ -346,6 +346,7 @@
      END IF
*
   40 CONTINUE
+*
      WORK( 1 ) = LWKOPT
      RETURN
*
diff --git a/lapack-netlib/SRC/zhetrf_aa.f b/lapack-netlib/SRC/zhetrf_aa.f
index 537c16e8c..381c87d51 100644
--- a/lapack-netlib/SRC/zhetrf_aa.f
+++ b/lapack-netlib/SRC/zhetrf_aa.f
@@ -101,8 +101,10 @@
*> \param[in] LWORK
*> \verbatim
*>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >= MAX(1,2*N). For optimum performance
-*>          LWORK >= N*(1+NB), where NB is the optimal blocksize.
+*>          The length of WORK.
+*>          LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise.
+*>          For optimum performance LWORK >= N*(1+NB), where NB is
+*>          the optimal blocksize, returned by ILAENV.
*>
*>          If LWORK = -1, then a workspace query is assumed; the routine
*>          only calculates the optimal size of the WORK array, returns
@@ -125,10 +127,10 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrf_aa
*
*  =====================================================================
-      SUBROUTINE ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO)
+      SUBROUTINE ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
*
*  -- LAPACK computational routine --
*  -- LAPACK is a software package provided by Univ. of Tennessee, --
@@ -152,7 +154,7 @@
*
*     .. Local Scalars ..
      LOGICAL            LQUERY, UPPER
-      INTEGER            J, LWKOPT
+      INTEGER            J, LWKMIN, LWKOPT
      INTEGER            NB, MJ, NJ, K1, K2, J1, J2, J3, JB
      COMPLEX*16         ALPHA
*     ..
@@ -178,18 +180,25 @@
      INFO = 0
      UPPER = LSAME( UPLO, 'U' )
      LQUERY = ( LWORK.EQ.-1 )
+      IF( N.LE.1 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = 2*N
+         LWKOPT = (NB+1)*N
+      END IF
+*
      IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
         INFO = -1
      ELSE IF( N.LT.0 ) THEN
         INFO = -2
      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
         INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N WORK( 1 ) = LWKOPT END IF * @@ -202,11 +211,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN A( 1, 1 ) = DBLE( A( 1, 1 ) ) RETURN END IF diff --git a/lapack-netlib/SRC/zhetrf_aa_2stage.f b/lapack-netlib/SRC/zhetrf_aa_2stage.f index 477602b5e..bab13a99d 100644 --- a/lapack-netlib/SRC/zhetrf_aa_2stage.f +++ b/lapack-netlib/SRC/zhetrf_aa_2stage.f @@ -87,14 +87,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX*16 array, dimension (LTB) +*> TB is COMPLEX*16 array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 workspace of size LWORK +*> WORK is COMPLEX*16 workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16SYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -182,7 +182,7 @@ * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY INTEGER I, J, K, I1, I2, TD - INTEGER LDTB, NB, KB, JB, NT, IINFO + INTEGER LWKOPT, LDTB, NB, KB, JB, NT, IINFO COMPLEX*16 PIV * .. * .. External Functions .. @@ -212,9 +212,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -228,10 +228,10 @@ NB = ILAENV( 1, 'ZHETRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = MAX( 1, (3*NB+1)*N ) END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = MAX( 1, N*NB ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -240,7 +240,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * @@ -392,7 +392,7 @@ CALL ZGETRF( N-(J+1)*NB, NB, $ WORK, N, $ IPIV( (J+1)*NB+1 ), IINFO ) -c IF (IINFO.NE.0 .AND. INFO.EQ.0) THEN +c IF( IINFO.NE.0 .AND. INFO.EQ.0 ) THEN c INFO = IINFO+(J+1)*NB c END IF * @@ -587,7 +587,7 @@ c END IF CALL ZGETRF( N-(J+1)*NB, NB, $ A( (J+1)*NB+1, J*NB+1 ), LDA, $ IPIV( (J+1)*NB+1 ), IINFO ) -c IF (IINFO.NE.0 .AND. INFO.EQ.0) THEN +c IF( IINFO.NE.0 .AND. INFO.EQ.0 ) THEN c INFO = IINFO+(J+1)*NB c END IF * diff --git a/lapack-netlib/SRC/zhetrf_rk.f b/lapack-netlib/SRC/zhetrf_rk.f index 73dd9f9d0..7c505fa4d 100644 --- a/lapack-netlib/SRC/zhetrf_rk.f +++ b/lapack-netlib/SRC/zhetrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension ( MAX(1,LWORK) ). 
+*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -229,7 +229,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf_rk * *> \par Further Details: * ===================== @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'ZHETRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/zhetrf_rook.f b/lapack-netlib/SRC/zhetrf_rook.f index e9de47248..a56349092 100644 --- a/lapack-netlib/SRC/zhetrf_rook.f +++ b/lapack-netlib/SRC/zhetrf_rook.f @@ -122,7 +122,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -150,7 +150,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf_rook * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/zhetri2.f b/lapack-netlib/SRC/zhetri2.f index 384745c3a..1d932b866 100644 --- a/lapack-netlib/SRC/zhetri2.f +++ b/lapack-netlib/SRC/zhetri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (N+NB+1)*(NB+3) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)). *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE ZHETRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -159,9 +159,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +177,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = MINSIZE RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - IF( NBMAX .GE. 
N ) THEN + IF( NBMAX.GE.N ) THEN CALL ZHETRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL ZHETRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of ZHETRI2 diff --git a/lapack-netlib/SRC/zhetrs_aa.f b/lapack-netlib/SRC/zhetrs_aa.f index 06ac1fd28..b7a1f7f07 100644 --- a/lapack-netlib/SRC/zhetrs_aa.f +++ b/lapack-netlib/SRC/zhetrs_aa.f @@ -106,7 +106,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -124,7 +130,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrs_aa * * ===================================================================== SUBROUTINE ZHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, @@ -152,7 +158,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME @@ -162,13 +168,19 @@ EXTERNAL ZGTSV, ZSWAP, ZTRSM, ZLACGV, ZLACPY, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -179,21 +191,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKMIN RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/zlamswlq.f b/lapack-netlib/SRC/zlamswlq.f index 4abefa434..59a0a5558 100644 --- a/lapack-netlib/SRC/zlamswlq.f +++ b/lapack-netlib/SRC/zlamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -189,92 +192,103 @@ *> SIAM J. Sci. Comput, vol. 34, no. 
1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE ZLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX*16 A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. External Subroutines .. - EXTERNAL ZTPMLQT, ZGEMLQT, XERBLA + EXTERNAL ZTPMLQT, ZGEMLQT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLAMSWLQ', -INFO ) - WORK(1) = LW RETURN - ELSE IF (LQUERY) THEN - WORK(1) = LW + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * IF((NB.LE.K).OR.(NB.GE.MAX(M,N,K))) THEN CALL ZGEMLQT( SIDE, TRANS, M, N, K, MB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN END IF * @@ -403,7 +417,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN RETURN * * End of ZLAMSWLQ diff --git a/lapack-netlib/SRC/zlamtsqr.f b/lapack-netlib/SRC/zlamtsqr.f index 5030cb75f..03770c06e 100644 --- a/lapack-netlib/SRC/zlamtsqr.f +++ b/lapack-netlib/SRC/zlamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). 
*> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -191,46 +193,50 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE ZLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX*16 A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. External Subroutines .. - EXTERNAL ZGEMQRT, ZTPMQRT, XERBLA + EXTERNAL ZGEMQRT, ZTPMQRT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -238,11 +244,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -253,38 +265,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. 
(.NOT.LQUERY) ) THEN
         INFO = -15
      END IF
*
-*     Determine the block size if it is tall skinny or short and wide
-*
-      IF( INFO.EQ.0) THEN
-        WORK(1) = LW
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = LWMIN
      END IF
*
      IF( INFO.NE.0 ) THEN
        CALL XERBLA( 'ZLAMTSQR', -INFO )
        RETURN
-      ELSE IF (LQUERY) THEN
-        RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
      END IF
*
*     Quick return if possible
*
-      IF( MIN(M,N,K).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
        RETURN
      END IF
+*
+*     Determine the block size if it is tall skinny or short and wide
*
      IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN
        CALL ZGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA,
-     $                T, LDT, C, LDC, WORK, INFO)
+     $                T, LDT, C, LDC, WORK, INFO )
        RETURN
-      END IF 
+      END IF
*
      IF(LEFT.AND.NOTRAN) THEN
*
@@ -410,7 +422,7 @@
*
      END IF
*
-      WORK(1) = LW
+      WORK( 1 ) = LWMIN
      RETURN
*
*     End of ZLAMTSQR
diff --git a/lapack-netlib/SRC/zlaqp2rk.c b/lapack-netlib/SRC/zlaqp2rk.c
new file mode 100644
index 000000000..0d38e71fb
--- /dev/null
+++ b/lapack-netlib/SRC/zlaqp2rk.c
@@ -0,0 +1,947 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{   flag cierr;
+    ftnint ciunit;
+    flag ciend;
+    char *cifmt;
+    ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{   flag icierr;
+    char *iciunit;
+    flag iciend;
+    char *icifmt;
+    ftnint icirlen;
+    ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{   flag oerr;
+    ftnint ounit;
+    char *ofnm;
+    ftnlen ofnmlen;
+    char *osta;
+    char *oacc;
+    char *ofm;
+    ftnint orl;
+    char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{   flag cerr;
+    ftnint cunit;
+    char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{   flag aerr;
+    ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{   flag inerr;
+    ftnint inunit;
+    char *infile;
+    ftnlen infilen;
+    ftnint  *inex;  /*parameters in standard's order*/
+
ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + *info = *n + kk - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL >= ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL >= ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + +/* Set K, the number of factorized columns. */ + + *k = kk - 1; + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. */ + + i__2 = minmnfact; + for (j = kk; j <= i__2; ++j) { + i__3 = j; + tau[i__3].r = 0., tau[i__3].i = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,KK:N): */ +/* 1) swap the KK-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) copy the KK-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than KK in the next loop step.) */ +/* 3) Save the pivot interchange with the indices relative to the */ +/* the original matrix A, not the block A(1:M,1:N). */ + + if (kp != kk) { + zswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1); + vn1[kp] = vn1[kk]; + vn2[kp] = vn2[kk]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[kk]; + jpiv[kk] = itemp; + } + +/* Generate elementary reflector H(KK) using the column A(I:M,KK), */ +/* if the column has more than one element, otherwise */ +/* the elementary reflector would be an identity matrix, */ +/* and TAU(KK) = CZERO. 
*/ + + if (i__ < *m) { + i__2 = *m - i__ + 1; + zlarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], & + c__1, &tau[kk]); + } else { + i__2 = kk; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + +/* Check if TAU(KK) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(KK) for Inf, */ +/* since ZLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by ZLARFG can contain Inf, which requires */ +/* TAU(KK) to contain NaN. Therefore, this case of generating Inf */ +/* by ZLARFG is covered by checking TAU(KK) for NaN. */ + + i__2 = kk; + d__1 = tau[i__2].r; + if (disnan_(&d__1)) { + i__2 = kk; + taunan = tau[i__2].r; + } else /* if(complicated condition) */ { + d__1 = d_imag(&tau[kk]); + if (disnan_(&d__1)) { + taunan = d_imag(&tau[kk]); + } else { + taunan = 0.; + } + } + + if (disnan_(&taunan)) { + *k = kk - 1; + *info = kk; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = taunan; + *relmaxc2nrmk = taunan; + +/* Array TAU(KK:MINMNFACT) is not set and contains */ +/* undefined elements, except the first element TAU(KK) = NaN. */ + + return 0; + } + +/* Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. */ +/* ( If M >= N, then at KK = N there is no residual matrix, */ +/* i.e. no columns of A to update, only columns of B. */ +/* If M < N, then at KK = M-IOFFSET, I = M and we have a */ +/* one-row residual matrix in A and the elementary */ +/* reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update */ +/* is needed for the residual matrix in A and the */ +/* right-hand-side-matrix in B. */ +/* Therefore, we update only if */ +/* KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */ +/* condition is satisfied, not only KK < N+NRHS ) */ + + if (kk < minmnupdt) { + i__2 = i__ + kk * a_dim1; + aikk.r = a[i__2].r, aikk.i = a[i__2].i; + i__2 = i__ + kk * a_dim1; + a[i__2].r = 1., a[i__2].i = 0.; + i__2 = *m - i__ + 1; + i__3 = *n + *nrhs - kk; + d_cnjg(&z__1, &tau[kk]); + zlarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &z__1, + &a[i__ + (kk + 1) * a_dim1], lda, &work[1]); + i__2 = i__ + kk * a_dim1; + a[i__2].r = aikk.r, a[i__2].i = aikk.i; + } + + if (kk < minmnfact) { + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */ +/* when KK < f2cmin(M-IOFFSET, N). */ + + i__2 = *n; + for (j = kk + 1; j <= i__2; ++j) { + if (vn1[j] != 0.) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + +/* Computing 2nd power */ + d__1 = z_abs(&a[i__ + j * a_dim1]) / vn1[j]; + temp = 1. - d__1 * d__1; + temp = f2cmax(temp,0.); +/* Computing 2nd power */ + d__1 = vn1[j] / vn2[j]; + temp2 = temp * (d__1 * d__1); + if (temp2 <= tol3z) { + +/* Compute the column 2-norm for the partial */ +/* column A(I+1:M,J) by explicitly computing it, */ +/* and store it in both partial 2-norm vector VN1 */ +/* and exact column 2-norm vector VN2. */ + + i__3 = *m - i__; + vn1[j] = dznrm2_(&i__3, &a[i__ + 1 + j * a_dim1], & + c__1); + vn2[j] = vn1[j]; + + } else { + +/* Update the column 2-norm for the partial */ +/* column A(I+1:M,J) by removing one */ +/* element A(I,J) and store it in partial */ +/* 2-norm vector VN1. */ + + vn1[j] *= sqrt(temp); + + } + } + } + + } + +/* End factorization loop */ + + } + +/* If we reached this point, all colunms have been factorized, */ +/* i.e. 
no condition was triggered to exit the routine. */ +/* Set the number of factorized columns. */ + + *k = *kmax; + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */ +/* we return. */ + + if (*k < minmnfact) { + + i__1 = *n - *k; + jmaxc2nrm = *k + idamax_(&i__1, &vn1[*k + 1], &c__1); + *maxc2nrmk = vn1[jmaxc2nrm]; + + if (*k == 0) { + *relmaxc2nrmk = 1.; + } else { + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + } + + } else { + *maxc2nrmk = 0.; + *relmaxc2nrmk = 0.; + } + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, set TAUs corresponding to the columns that were */ +/* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO. */ + + i__1 = minmnfact; + for (j = *k + 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + + return 0; + +/* End of ZLAQP2RK */ + +} /* zlaqp2rk_ */ + diff --git a/lapack-netlib/SRC/zlaqp2rk.f b/lapack-netlib/SRC/zlaqp2rk.f new file mode 100644 index 000000000..f6bf555c2 --- /dev/null +++ b/lapack-netlib/SRC/zlaqp2rk.f @@ -0,0 +1,726 @@ +*> \brief \b ZLAQP2RK computes truncated QR factorization with column pivoting of a complex matrix block using Level 2 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZLAQP2RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, +* $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, +* $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, +* $ INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER JPIV( * ) +* DOUBLE PRECISION VN1( * ), VN2( * ) +* COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) +* $ +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLAQP2RK computes a truncated (rank K) or full rank Householder QR +*> factorization with column pivoting of the complex matrix +*> block A(IOFFSET+1:M,1:N) as +*> +*> A * P(K) = Q(K) * R(K). +*> +*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N) +*> is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides matrix block B +*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**H * B. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. 
+*>
+*>          The maximum number of columns of the matrix A to factorize,
+*>          i.e. the maximum factorization rank.
+*>
+*>          a) If KMAX >= min(M-IOFFSET,N), then this stopping
+*>                criterion is not used, factorize columns
+*>                depending on ABSTOL and RELTOL.
+*>
+*>          b) If KMAX = 0, then this stopping criterion is
+*>                satisfied on input and the routine exits immediately.
+*>                This means that the factorization is not performed,
+*>                the matrices A and B and the arrays TAU, JPIV
+*>                are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The second factorization stopping criterion.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The third factorization stopping criterion.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          b) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine ZGEQP3RK. 1 <= KP1 <= N_orig_mat.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is DOUBLE PRECISION
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine ZGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:K) below
+*>             the diagonal together with the array TAU represent
+*>             the unitary matrix Q(K) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:K) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,K+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(K)**H.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX*16 array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is DOUBLE PRECISION array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is DOUBLE PRECISION array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (N-1) +*> Used in ZLARF subroutine to apply an elementary +*> reflector from the left. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp2rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. 
+*>
+*> \htmlonly
+*> https://doi.org/10.1137/S1064827595296732
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> https://doi.org/10.1145/1377612.1377616
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE ZLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+     $                     INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee, --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            JPIV( * )
+      DOUBLE PRECISION   VN1( * ), VN2( * )
+      COMPLEX*16         A( LDA, * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+      COMPLEX*16         CZERO, CONE
+      PARAMETER          ( CZERO = ( 0.0D+0, 0.0D+0 ),
+     $                     CONE  = ( 1.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT,
+     $                   MINMNUPDT
+      DOUBLE PRECISION   HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z
+      COMPLEX*16         AIKK
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZLARF, ZLARFG, ZSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DBLE, DCONJG, DIMAG, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            DISNAN
+      INTEGER            IDAMAX
+      DOUBLE PRECISION   DLAMCH, DZNRM2
+      EXTERNAL           DISNAN, DLAMCH, IDAMAX, DZNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT is the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+*     MINMNUPDT is the smallest dimension
+*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be updated, which
+*     contains the submatrices A(IOFFSET+1:M,1:N) and
+*     B(IOFFSET+1:M,1:NRHS) as column blocks.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      KMAX = MIN( KMAX, MINMNFACT )
+      TOL3Z = SQRT( DLAMCH( 'Epsilon' ) )
+      HUGEVAL = DLAMCH( 'Overflow' )
+*
+*     Compute the factorization, KK is the column loop index.
+*
+      DO KK = 1, KMAX
+*
+         I = IOFFSET + KK
+*
+         IF( I.EQ.1 ) THEN
+*
+*           ============================================================
+*
+*           We are at the first column of the original whole matrix A,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+            KP = KP1
+*
+*           ============================================================
+*
+         ELSE
+*
+*           ============================================================
+*
+*           Determine the pivot column in KK-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,KK:N).
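[Editor's note] The pivot search that follows scans the partial column norms kept in VN1. For reference, the downdating rule those norms obey (LAWN 176, reference [2] above; the corresponding code appears near the end of the factorization loop below) removes one row per step:

\[
\|a_j^{(I+1)}\|_2^{2} \;=\; \|a_j^{(I)}\|_2^{2} \;-\; |A(I,j)|^{2}
\quad\Longrightarrow\quad
\mathrm{VN1}(j) \;\leftarrow\; \mathrm{VN1}(j)\,
\sqrt{\max\!\Big(0,\; 1-\big(|A(I,j)|/\mathrm{VN1}(j)\big)^{2}\Big)} ,
\]

and the norm is recomputed exactly with DZNRM2 whenever the cancellation guard, measured against the last exactly computed value VN2(j), satisfies \( \big(1-(|A(I,j)|/\mathrm{VN1}(j))^{2}\big)\big(\mathrm{VN1}(j)/\mathrm{VN2}(j)\big)^{2} \le \mathrm{TOL3Z} = \sqrt{\varepsilon} \).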
+*
+            KP = ( KK-1 ) + IDAMAX( N-KK+1, VN1( KK ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,KK:N) in step KK.
+*           RELMAXC2NRMK will be computed later, after some condition
+*           checks on MAXC2NRMK.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains NaN, and set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( DISNAN( MAXC2NRMK ) ) THEN
+*
+*              Set K, the number of factorized columns
+*              that are not zero.
+*
+               K = KK - 1
+               INFO = K + KP
+*
+*              Set RELMAXC2NRMK to NaN.
+*
+               RELMAXC2NRMK = MAXC2NRMK
+*
+*              Array TAU(K+1:MINMNFACT) is not set and contains
+*              undefined elements.
+*
+               RETURN
+            END IF
+*
+*           ============================================================
+*
+*           Quick return, if the submatrix A(I:M,KK:N) is
+*           a zero matrix.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+*              Set K, the number of factorized columns
+*              that are not zero.
+*
+               K = KK - 1
+               RELMAXC2NRMK = ZERO
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + KK - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third stopping criteria.
+*           NOTE: There is no need to test for ABSTOL >= ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL >= ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+
+            RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+*              Set K, the number of factorized columns.
+*
+               K = KK - 1
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,KK:N): +* 1) swap the KK-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) copy the KK-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than KK in the next loop step.) +* 3) Save the pivot interchange with the indices relative to the +* the original matrix A, not the block A(1:M,1:N). +* + IF( KP.NE.KK ) THEN + CALL ZSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 ) + VN1( KP ) = VN1( KK ) + VN2( KP ) = VN2( KK ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( KK ) + JPIV( KK ) = ITEMP + END IF +* +* Generate elementary reflector H(KK) using the column A(I:M,KK), +* if the column has more than one element, otherwise +* the elementary reflector would be an identity matrix, +* and TAU(KK) = CZERO. +* + IF( I.LT.M ) THEN + CALL ZLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1, + $ TAU( KK ) ) + ELSE + TAU( KK ) = CZERO + END IF +* +* Check if TAU(KK) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(KK) for Inf, +* since ZLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by ZLARFG can contain Inf, which requires +* TAU(KK) to contain NaN. Therefore, this case of generating Inf +* by ZLARFG is covered by checking TAU(KK) for NaN. +* + IF( DISNAN( DBLE( TAU(KK) ) ) ) THEN + TAUNAN = DBLE( TAU(KK) ) + ELSE IF( DISNAN( DIMAG( TAU(KK) ) ) ) THEN + TAUNAN = DIMAG( TAU(KK) ) + ELSE + TAUNAN = ZERO + END IF +* + IF( DISNAN( TAUNAN ) ) THEN + K = KK - 1 + INFO = KK +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAUNAN + RELMAXC2NRMK = TAUNAN +* +* Array TAU(KK:MINMNFACT) is not set and contains +* undefined elements, except the first element TAU(KK) = NaN. +* + RETURN + END IF +* +* Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. +* ( If M >= N, then at KK = N there is no residual matrix, +* i.e. no columns of A to update, only columns of B. +* If M < N, then at KK = M-IOFFSET, I = M and we have a +* one-row residual matrix in A and the elementary +* reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update +* is needed for the residual matrix in A and the +* right-hand-side-matrix in B. +* Therefore, we update only if +* KK < MINMNUPDT = min(M-IOFFSET, N+NRHS) +* condition is satisfied, not only KK < N+NRHS ) +* + IF( KK.LT.MINMNUPDT ) THEN + AIKK = A( I, KK ) + A( I, KK ) = CONE + CALL ZLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1, + $ DCONJG( TAU( KK ) ), A( I, KK+1 ), LDA, + $ WORK( 1 ) ) + A( I, KK ) = AIKK + END IF +* + IF( KK.LT.MINMNFACT ) THEN +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. +* when KK < min(M-IOFFSET, N). +* + DO J = KK + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2 + TEMP = MAX( TEMP, ZERO ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2 .LE. 
TOL3Z ) THEN
+*
+*                 Compute the column 2-norm for the partial
+*                 column A(I+1:M,J) by explicitly computing it,
+*                 and store it in both partial 2-norm vector VN1
+*                 and exact column 2-norm vector VN2.
+*
+                  VN1( J ) = DZNRM2( M-I, A( I+1, J ), 1 )
+                  VN2( J ) = VN1( J )
+*
+               ELSE
+*
+*                 Update the column 2-norm for the partial
+*                 column A(I+1:M,J) by removing one
+*                 element A(I,J) and store it in partial
+*                 2-norm vector VN1.
+*
+                  VN1( J ) = VN1( J )*SQRT( TEMP )
+*
+               END IF
+            END IF
+         END DO
+*
+      END IF
+*
+*     End factorization loop
+*
+      END DO
+*
+*     If we reached this point, all columns have been factorized,
+*     i.e. no condition was triggered to exit the routine.
+*     Set the number of factorized columns.
+*
+      K = KMAX
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before
+*     we return.
+*
+      IF( K.LT.MINMNFACT ) THEN
+*
+         JMAXC2NRM = K + IDAMAX( N-K, VN1( K+1 ), 1 )
+         MAXC2NRMK = VN1( JMAXC2NRM )
+*
+         IF( K.EQ.0 ) THEN
+            RELMAXC2NRMK = ONE
+         ELSE
+            RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+         END IF
+*
+      ELSE
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+      END IF
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, set TAUs corresponding to the columns that were
+*     not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO.
+*
+      DO J = K + 1, MINMNFACT
+         TAU( J ) = CZERO
+      END DO
+*
+      RETURN
+*
+*     End of ZLAQP2RK
+*
+      END
diff --git a/lapack-netlib/SRC/zlaqp3rk.c b/lapack-netlib/SRC/zlaqp3rk.c
new file mode 100644
index 000000000..cb44e4d34
--- /dev/null
+++ b/lapack-netlib/SRC/zlaqp3rk.c
@@ -0,0 +1,1157 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{ flag cierr;
+  ftnint ciunit;
+  flag ciend;
+  char *cifmt;
+  ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? 
((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static 
_Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, + kb, &z__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* Quick return, if the submatrix A(I:M,K:N) is */ +/* a zero matrix. We need to check it only if the column index */ +/* (same as row index) is larger than 1, since the condition */ +/* for the whole original matrix A_orig is checked in the main */ +/* routine. */ + + if (*maxc2nrmk == 0.) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *relmaxc2nrmk = 0.; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix is zero and we stop the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, + kb, &z__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. 
*/ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* Check if the submatrix A(I:M,K:N) contains Inf, */ +/* set INFO parameter to the column number, where */ +/* the first Inf is found plus N, and continue */ +/* the computation. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + if (*info == 0 && *maxc2nrmk > myhugeval) { + *info = *n + k - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third tolerance stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL.GE.ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig; */ + + *kb = k - 1; + if__ = i__ - 1; + +/* Apply the block reflector to the residual of the */ +/* matrix A and the residual of the right hand sides B, if */ +/* the residual matrix and and/or the residual of the right */ +/* hand sides exist, i.e. if the submatrix */ +/* A(I+1:M,KB+1:N+NRHS) exists. This occurs when */ +/* KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, & + i__2, kb, &z__1, &a[if__ + 1 + a_dim1], lda, &f[* + kb + 1 + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*kb + + 1) * a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,K:N): */ +/* 1) swap the K-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */ +/* 3) copy the K-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. 
(Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than K in the next loop step.) */ +/* 4) Save the pivot interchange with the indices relative to the */ +/* the original matrix A_orig, not the block A(1:M,1:N). */ + + if (kp != k) { + zswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + zswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf); + vn1[kp] = vn1[k]; + vn2[kp] = vn2[k]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[k]; + jpiv[k] = itemp; + } + +/* Apply previous Householder reflectors to column K: */ +/* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. */ + + if (k > 1) { + i__1 = k - 1; + for (j = 1; j <= i__1; ++j) { + i__2 = k + j * f_dim1; + d_cnjg(&z__1, &f[k + j * f_dim1]); + f[i__2].r = z__1.r, f[i__2].i = z__1.i; + } + i__1 = *m - i__ + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = 0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[i__ + a_dim1], lda, + &f[k + f_dim1], ldf, &c_b2, &a[i__ + k * a_dim1], &c__1); + i__1 = k - 1; + for (j = 1; j <= i__1; ++j) { + i__2 = k + j * f_dim1; + d_cnjg(&z__1, &f[k + j * f_dim1]); + f[i__2].r = z__1.r, f[i__2].i = z__1.i; + } + } + +/* Generate elementary reflector H(k) using the column A(I:M,K). */ + + if (i__ < *m) { + i__1 = *m - i__ + 1; + zlarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], & + c__1, &tau[k]); + } else { + i__1 = k; + tau[i__1].r = 0., tau[i__1].i = 0.; + } + +/* Check if TAU(K) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(K) for Inf, */ +/* since ZLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by ZLARFG can contain Inf, which requires */ +/* TAU(K) to contain NaN. Therefore, this case of generating Inf */ +/* by ZLARFG is covered by checking TAU(K) for NaN. */ + + i__1 = k; + d__1 = tau[i__1].r; + if (disnan_(&d__1)) { + i__1 = k; + taunan = tau[i__1].r; + } else /* if(complicated condition) */ { + d__1 = d_imag(&tau[k]); + if (disnan_(&d__1)) { + taunan = d_imag(&tau[k]); + } else { + taunan = 0.; + } + } + + if (disnan_(&taunan)) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *info = k; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = taunan; + *relmaxc2nrmk = taunan; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix contains NaN and we stop */ +/* the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. 
*/ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, kb, + &z__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * a_dim1], + lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* =============================================================== */ + + i__1 = i__ + k * a_dim1; + aik.r = a[i__1].r, aik.i = a[i__1].i; + i__1 = i__ + k * a_dim1; + a[i__1].r = 1., a[i__1].i = 0.; + +/* =============================================================== */ + +/* Compute the current K-th column of F: */ +/* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). */ + + if (k < *n + *nrhs) { + i__1 = *m - i__ + 1; + i__2 = *n + *nrhs - k; + zgemv_("Conjugate transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + + 1) * a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, & + f[k + 1 + k * f_dim1], &c__1); + } + +/* 2) Zero out elements above and on the diagonal of the */ +/* column K in matrix F, i.e elements F(1:K,K). */ + + i__1 = k; + for (j = 1; j <= i__1; ++j) { + i__2 = j + k * f_dim1; + f[i__2].r = 0., f[i__2].i = 0.; + } + +/* 3) Incremental updating of the K-th column of F: */ +/* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H */ +/* * A(I:M,K). */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + i__3 = k; + z__1.r = -tau[i__3].r, z__1.i = -tau[i__3].i; + zgemv_("Conjugate Transpose", &i__1, &i__2, &z__1, &a[i__ + + a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, &auxv[1] + , &c__1); + + i__1 = *n + *nrhs; + i__2 = k - 1; + zgemv_("No transpose", &i__1, &i__2, &c_b2, &f[f_dim1 + 1], ldf, & + auxv[1], &c__1, &c_b2, &f[k * f_dim1 + 1], &c__1); + } + +/* =============================================================== */ + +/* Update the current I-th row of A: */ +/* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */ +/* - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. */ + + if (k < *n + *nrhs) { + i__1 = *n + *nrhs - k; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &c__1, &i__1, &k, & + z__1, &a[i__ + a_dim1], lda, &f[k + 1 + f_dim1], ldf, & + c_b2, &a[i__ + (k + 1) * a_dim1], lda); + } + + i__1 = i__ + k * a_dim1; + a[i__1].r = aik.r, a[i__1].i = aik.i; + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */ +/* when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */ + + if (k < minmnfact) { + + i__1 = *n; + for (j = k + 1; j <= i__1; ++j) { + if (vn1[j] != 0.) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + + temp = z_abs(&a[i__ + j * a_dim1]) / vn1[j]; +/* Computing MAX */ + d__1 = 0., d__2 = (temp + 1.) * (1. - temp); + temp = f2cmax(d__1,d__2); +/* Computing 2nd power */ + d__1 = vn1[j] / vn2[j]; + temp2 = temp * (d__1 * d__1); + if (temp2 <= tol3z) { + +/* At J-index, we have a difficult column for the */ +/* update of the 2-norm. Save the index of the previous */ +/* difficult column in IWORK(J-1). */ +/* NOTE: ILSTCC > 1, threfore we can use IWORK only */ +/* with N-1 elements, where the elements are */ +/* shifted by 1 to the left. */ + + iwork[j - 1] = lsticc; + +/* Set the index of the last difficult column LSTICC. 
*/ + + lsticc = j; + + } else { + vn1[j] *= sqrt(temp); + } + } + } + + } + +/* End of while loop. */ + + } + +/* Now, afler the loop: */ +/* Set KB, the number of factorized columns in the block; */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig, IF = IOFFSET + KB. */ + + *kb = k; + if__ = i__; + +/* Apply the block reflector to the residual of the matrix A */ +/* and the residual of the right hand sides B, if the residual */ +/* matrix and and/or the residual of the right hand sides */ +/* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */ +/* This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, &i__2, kb, &z__1, + &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b2, + &a[if__ + 1 + (*kb + 1) * a_dim1], lda); + } + +/* Recompute the 2-norm of the difficult columns. */ +/* Loop over the index of the difficult columns from the largest */ +/* to the smallest index. */ + + while(lsticc > 0) { + +/* LSTICC is the index of the last difficult column is greater */ +/* than 1. */ +/* ITEMP is the index of the previous difficult column. */ + + itemp = iwork[lsticc - 1]; + +/* Compute the 2-norm explicilty for the last difficult column and */ +/* save it in the partial and exact 2-norm vectors VN1 and VN2. */ + +/* NOTE: The computation of VN1( LSTICC ) relies on the fact that */ +/* DZNRM2 does not fail on vectors with norm below the value of */ +/* SQRT(DLAMCH('S')) */ + + i__1 = *m - if__; + vn1[lsticc] = dznrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1); + vn2[lsticc] = vn1[lsticc]; + +/* Downdate the index of the last difficult column to */ +/* the index of the previous difficult column. */ + + lsticc = itemp; + + } + + return 0; + +/* End of ZLAQP3RK */ + +} /* zlaqp3rk_ */ + diff --git a/lapack-netlib/SRC/zlaqp3rk.f b/lapack-netlib/SRC/zlaqp3rk.f new file mode 100644 index 000000000..0dd8bf8e3 --- /dev/null +++ b/lapack-netlib/SRC/zlaqp3rk.f @@ -0,0 +1,947 @@ +*> \brief \b ZLAQP3RK computes a step of truncated QR factorization with column pivoting of a complex m-by-n matrix A using Level 3 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZLAQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, +* $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, +* $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) +* IMPLICIT NONE +* LOGICAL DONE +* INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, +* $ NB, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* DOUBLE PRECISION VN1( * ), VN2( * ) +* COMPLEX*16 A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ) +* .. 
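Between the C translation above and the Fortran documentation that continues below, one implementation note: both zlaqp3rk.c and zlaqp3rk.f guard the LAWN 176 partial-norm downdating with the TOL3Z = sqrt(eps) test. A hedged C sketch of that test (not part of the patch; names illustrative): the running norm vn1 is downdated after a column loses its leading entry, and once cancellation relative to the last exactly computed norm vn2 passes the safe bound, the caller must recompute the norm explicitly.

    #include <math.h>

    /* Sketch only: downdate one partial column 2-norm per LAWN 176.
       Returns 1 when the caller must recompute the norm explicitly. */
    static int downdate_norm(double *vn1, double vn2, double abs_aij,
                             double tol3z)
    {
        double temp = abs_aij / *vn1;
        temp = fmax(0.0, (1.0 + temp) * (1.0 - temp));
        double temp2 = temp * (*vn1 / vn2) * (*vn1 / vn2);
        if (temp2 <= tol3z)
            return 1;            /* too much cancellation: recompute */
        *vn1 *= sqrt(temp);      /* safe to keep the downdated norm */
        return 0;
    }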
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZLAQP3RK computes a step of truncated QR factorization with column
+*> pivoting of a complex M-by-N matrix A, block A(IOFFSET+1:M,1:N),
+*> by using Level 3 BLAS as
+*>
+*>   A * P(KB) = Q(KB) * R(KB).
+*>
+*> The routine tries to factorize NB columns from A starting from
+*> the row IOFFSET+1 and updates the residual matrix with BLAS 3
+*> xGEMM. The number of actually factorized columns is returned
+*> in KB and may be smaller than NB.
+*>
+*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides B matrix stored
+*> in A(IOFFSET+1:M,N+1:N+NRHS) with Q(KB)**H * B.
+*>
+*> Cases when the number of factorized columns KB < NB:
+*>
+*> (1) In some cases, due to catastrophic cancellations, it cannot
+*> factorize all NB columns and needs to update the residual matrix.
+*> Hence, the actual number of factorized columns in the block returned
+*> in KB is smaller than NB. The logical DONE is returned as FALSE.
+*> The factorization of the whole original matrix A_orig must proceed
+*> with the next block.
+*>
+*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB.
+*>
+*> (3) In case both stopping criteria ABSTOL or RELTOL are not used,
+*> and when the residual matrix is a zero matrix in some factorization
+*> step KB, the factorization of the whole original matrix A_orig is
+*> stopped, the logical DONE is returned as TRUE. The number of
+*> factorized columns which is smaller than NB is returned in KB.
+*>
+*> (4) Whenever NaN is detected in the matrix A or in the array TAU,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB. The INFO
+*> parameter is set to the column index of the first NaN occurrence.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          Factorization block size, i.e. the number of columns
+*>          to factorize in the matrix A. 0 <= NB.
+*>
+*>          If NB = 0, then the routine exits immediately.
+*>             This means that the factorization is not performed,
+*>             the matrices A and B and the arrays TAU, JPIV
+*>             are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          b) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine ZGEQP3RK. 1 <= KP1 <= N_orig.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is DOUBLE PRECISION
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine ZGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:KB) below
+*>             the diagonal together with the array TAU represent
+*>             the unitary matrix Q(KB) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:KB) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,KB+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(KB)**H.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] DONE
+*> \verbatim
+*>          DONE is LOGICAL
+*>          TRUE: a) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to ABSTOL
+*>                   or RELTOL criterion,
+*>                b) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to the
+*>                   residual matrix being a ZERO matrix.
+*>                c) when NaN was detected in the matrix A
+*>                   or in the array TAU.
+*>          FALSE: otherwise.
+*> \endverbatim
+*>
+*> \param[out] KB
+*> \verbatim
+*>          KB is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= KB <= min(M-IOFFSET,NB,N).
+*>
+*>          KB also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is DOUBLE PRECISION
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank KB. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is DOUBLE PRECISION
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank KB) to the maximum column 2-norm of the
+*>          original matrix A_orig. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX*16 array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] AUXV
+*> \verbatim
+*>          AUXV is COMPLEX*16 array, dimension (NB)
+*>          Auxiliary vector.
+*> \endverbatim
+*>
+*> \param[out] F
+*> \verbatim
+*>          F is COMPLEX*16 array, dimension (LDF,NB)
+*>          Matrix F**H = L*(Y**H)*A.
+*> \endverbatim
+*>
+*> \param[in] LDF
+*> \verbatim
+*>          LDF is INTEGER
+*>          The leading dimension of the array F. LDF >= max(1,N+NRHS).
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          A work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step KB+1 ( when KB columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             KB                  is set to the number of
+*>                                    factorized columns without
+*>                                    exception.
+*>             MAXC2NRMK           is set to NaN.
+*>             RELMAXC2NRMK        is set to NaN.
+*>             TAU(KB+1:min(M,N))  is not set and contains undefined
+*>                                    elements. If j_1=KB+1, TAU(KB+1)
+*>                                    may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step KB+1 ( when KB columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp3rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> EECS Department, +*> University of California, Berkeley, USA. +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, + $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + LOGICAL DONE + INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, + $ NB, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + DOUBLE PRECISION VN1( * ), VN2( * ) + COMPLEX*16 A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + COMPLEX*16 CZERO, CONE + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), + $ CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER ITEMP, J, K, MINMNFACT, MINMNUPDT, + $ LSTICC, KP, I, IF + DOUBLE PRECISION HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z + COMPLEX*16 AIK +* .. +* .. External Subroutines .. + EXTERNAL ZGEMM, ZGEMV, ZLARFG, ZSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX + DOUBLE PRECISION DLAMCH, DZNRM2 + EXTERNAL DISNAN, DLAMCH, IDAMAX, DZNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + NB = MIN( NB, MINMNFACT ) + TOL3Z = SQRT( DLAMCH( 'Epsilon' ) ) + HUGEVAL = DLAMCH( 'Overflow' ) +* +* Compute factorization in a while loop over NB columns, +* K is the column index in the block A(1:M,1:N). +* + K = 0 + LSTICC = 0 + DONE = .FALSE. +* + DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 ) + K = K + 1 + I = IOFFSET + K +* + IF( I.EQ.1 ) THEN +* +* We are at the first column of the original whole matrix A_orig, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + KP = KP1 +* + ELSE +* +* Determine the pivot column in K-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). 
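The pivot search resumes just below. First, a hedged skeleton (not part of the patch; names illustrative) of the block loop opened above with DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 ): the routine factorizes at most NB columns but leaves the block early once a difficult column is recorded, so that column's norm can be recomputed before the next block.

    /* Sketch only: shape of the blocked factorization step. */
    static int block_step(int nb)
    {
        int k = 0;          /* columns factorized in this block */
        int lsticc = 0;     /* nonzero marks a "difficult" column */
        while (k < nb && lsticc == 0) {
            ++k;
            /* pivot selection, Householder generation, updates of A
               and F, then norm downdating, which may set lsticc and
               thereby end the block early */
        }
        return k;           /* reported back as KB */
    }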
+* + KP = ( K-1 ) + IDAMAX( N-K+1, VN1( K ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,K:N) in step K. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains NaN, set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( DISNAN( MAXC2NRMK ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = KB + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. + + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* Quick return, if the submatrix A(I:M,K:N) is +* a zero matrix. We need to check it only if the column index +* (same as row index) is larger than 1, since the condition +* for the whole original matrix A_orig is checked in the main +* routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + RELMAXC2NRMK = ZERO +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix is zero and we stop the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. 
set TAU(KB+1:MINMNFACT) = CZERO,
+*              which is equivalent to setting TAU(K:MINMNFACT) = CZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + K - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third tolerance stopping criteria.
+*           NOTE: There is no need to test for ABSTOL.GE.ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*              that are non-zero in each step in the block,
+*              i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*              is the same as the number of processed rows in
+*              the original whole matrix A_orig;
+*
+               KB = K - 1
+               IF = I - 1
+*
+*              Apply the block reflector to the residual of the
+*              matrix A and the residual of the right hand sides B, if
+*              the residual matrix and/or the residual of the right
+*              hand sides exist, i.e. if the submatrix
+*              A(I+1:M,KB+1:N+NRHS) exists. This occurs when
+*              KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*              A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H.
+*
+               IF( KB.LT.MINMNUPDT ) THEN
+                  CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $                  M-IF, N+NRHS-KB, KB,-CONE, A( IF+1, 1 ), LDA,
+     $                  F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO,
+*              which is equivalent to setting TAU(K:MINMNFACT) = CZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           End ELSE of IF(I.EQ.1)
+*
+         END IF
+*
+*        ===============================================================
+*
+*        If the pivot column is not the first column of the
+*        subblock A(1:M,K:N):
+*        1) swap the K-th column and the KP-th pivot column
+*           in A(1:M,1:N);
+*        2) swap the K-th row and the KP-th row in F(1:N,1:K-1)
+*        3) copy the K-th element into the KP-th element of the partial
+*           and exact 2-norm vectors VN1 and VN2. (Swap is not needed
+*           for VN1 and VN2 since we use the element with the index
+*           larger than K in the next loop step.)
+*        4) Save the pivot interchange with the indices relative to
+*           the original matrix A_orig, not the block A(1:M,1:N).
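Steps 1)-4) above are carried out by the lines that follow; as a hedged illustration (not part of the patch; 0-based C with the ZSWAP column/row swaps elided), the scalar bookkeeping amounts to:

    /* Sketch only: record a pivot interchange between columns k and kp. */
    static void record_pivot(int k, int kp, double *vn1, double *vn2,
                             int *jpiv)
    {
        if (kp == k)
            return;             /* pivot already in place */
        /* ...swap columns of A and rows of F here (ZSWAP in the patch)... */
        vn1[kp] = vn1[k];       /* copy, not swap: entry kp is not reused */
        vn2[kp] = vn2[k];
        int itemp = jpiv[kp];   /* swap the recorded interchange */
        jpiv[kp] = jpiv[k];
        jpiv[k] = itemp;
    }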
+* + IF( KP.NE.K ) THEN + CALL ZSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 ) + CALL ZSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF ) + VN1( KP ) = VN1( K ) + VN2( KP ) = VN2( K ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( K ) + JPIV( K ) = ITEMP + END IF +* +* Apply previous Householder reflectors to column K: +* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. +* + IF( K.GT.1 ) THEN + DO J = 1, K - 1 + F( K, J ) = DCONJG( F( K, J ) ) + END DO + CALL ZGEMV( 'No transpose', M-I+1, K-1, -CONE, A( I, 1 ), + $ LDA, F( K, 1 ), LDF, CONE, A( I, K ), 1 ) + DO J = 1, K - 1 + F( K, J ) = DCONJG( F( K, J ) ) + END DO + END IF +* +* Generate elementary reflector H(k) using the column A(I:M,K). +* + IF( I.LT.M ) THEN + CALL ZLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) ) + ELSE + TAU( K ) = CZERO + END IF +* +* Check if TAU(K) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(K) for Inf, +* since ZLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by ZLARFG can contain Inf, which requires +* TAU(K) to contain NaN. Therefore, this case of generating Inf +* by ZLARFG is covered by checking TAU(K) for NaN. +* + IF( DISNAN( DBLE( TAU(K) ) ) ) THEN + TAUNAN = DBLE( TAU(K) ) + ELSE IF( DISNAN( DIMAG( TAU(K) ) ) ) THEN + TAUNAN = DIMAG( TAU(K) ) + ELSE + TAUNAN = ZERO + END IF +* + IF( DISNAN( TAUNAN ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = K +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAUNAN + RELMAXC2NRMK = TAUNAN +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* =============================================================== +* + AIK = A( I, K ) + A( I, K ) = CONE +* +* =============================================================== +* +* Compute the current K-th column of F: +* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). +* + IF( K.LT.N+NRHS ) THEN + CALL ZGEMV( 'Conjugate transpose', M-I+1, N+NRHS-K, + $ TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1, + $ CZERO, F( K+1, K ), 1 ) + END IF +* +* 2) Zero out elements above and on the diagonal of the +* column K in matrix F, i.e elements F(1:K,K). 
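Step 1) above forms the new column of F with a conjugate-transpose ZGEMV; a hedged scalar sketch (not part of the patch; 0-based, column-major, names illustrative) of that product follows, before the patch resumes with the zeroing loop of step 2):

    #include <complex.h>
    #include <stddef.h>

    /* Sketch only: fcol[j-k-1] = tau * sum_i conj(A(i,j)) * A(i,k),
       i.e. F(K+1:N,K) := tau(K) * A(:,K+1:N)**H * A(:,K). */
    static void form_f_column(size_t m, size_t n, size_t k,
                              const double complex *A, size_t lda,
                              double complex tau, double complex *fcol)
    {
        for (size_t j = k + 1; j < n; ++j) {
            double complex s = 0.0;
            for (size_t i = 0; i < m; ++i)
                s += conj(A[i + j * lda]) * A[i + k * lda];
            fcol[j - k - 1] = tau * s;
        }
    }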
+* + DO J = 1, K + F( J, K ) = CZERO + END DO +* +* 3) Incremental updating of the K-th column of F: +* F(1:N+NRHS,K) := F(1:N+NRHS,K) - tau(K) * F(1:N+NRHS,1:K-1) * A(I:M,1:K-1)**H +* * A(I:M,K). +* + IF( K.GT.1 ) THEN + CALL ZGEMV( 'Conjugate Transpose', M-I+1, K-1, -TAU( K ), + $ A( I, 1 ), LDA, A( I, K ), 1, CZERO, + $ AUXV( 1 ), 1 ) +* + CALL ZGEMV( 'No transpose', N+NRHS, K-1, CONE, + $ F( 1, 1 ), LDF, AUXV( 1 ), 1, CONE, + $ F( 1, K ), 1 ) + END IF +* +* =============================================================== +* +* Update the current I-th row of A: +* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) +* - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. +* + IF( K.LT.N+NRHS ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ 1, N+NRHS-K, K, -CONE, A( I, 1 ), LDA, + $ F( K+1, 1 ), LDF, CONE, A( I, K+1 ), LDA ) + END IF +* + A( I, K ) = AIK +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. +* when K < MINMNFACT = min( M-IOFFSET, N ). +* + IF( K.LT.MINMNFACT ) THEN +* + DO J = K + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ABS( A( I, J ) ) / VN1( J ) + TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2.LE.TOL3Z ) THEN +* +* At J-index, we have a difficult column for the +* update of the 2-norm. Save the index of the previous +* difficult column in IWORK(J-1). +* NOTE: LSTICC > 1, therefore we can use IWORK only +* with N-1 elements, where the elements are +* shifted by 1 to the left. +* + IWORK( J-1 ) = LSTICC +* +* Set the index of the last difficult column LSTICC. +* + LSTICC = J +* + ELSE + VN1( J ) = VN1( J )*SQRT( TEMP ) + END IF + END IF + END DO +* + END IF +* +* End of while loop. +* + END DO +* +* Now, after the loop: +* Set KB, the number of factorized columns in the block; +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig, IF = IOFFSET + KB. +* + KB = K + IF = I +* +* Apply the block reflector to the residual of the matrix A +* and the residual of the right hand sides B, if the residual +* matrix and/or the residual of the right hand sides +* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. +* This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, N+NRHS-KB, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* Recompute the 2-norm of the difficult columns. +* Loop over the index of the difficult columns from the largest +* to the smallest index. +* + DO WHILE( LSTICC.GT.0 ) +* +* LSTICC is the index of the last difficult column and is +* greater than 1. +* ITEMP is the index of the previous difficult column. +* + ITEMP = IWORK( LSTICC-1 ) +* +* Compute the 2-norm explicitly for the last difficult column and +* save it in the partial and exact 2-norm vectors VN1 and VN2. +* +* NOTE: The computation of VN1( LSTICC ) relies on the fact that +* DZNRM2 does not fail on vectors with norm below the value of +* SQRT(DLAMCH('S')). +* + VN1( LSTICC ) = DZNRM2( M-IF, A( IF+1, LSTICC ), 1 ) + VN2( LSTICC ) = VN1( LSTICC ) +* +* Downdate the index of the last difficult column to +* the index of the previous difficult column.
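+* NOTE (editorial): IWORK(1:N-1) acts as a linked list of the +* difficult columns, stored shifted one place to the left: when +* column J was flagged above, IWORK(J-1) received the previous +* value of LSTICC and LSTICC was set to J. This loop therefore +* walks the list from the last flagged column back to the first, +* recomputing each 2-norm with DZNRM2.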
+* + LSTICC = ITEMP +* + END DO +* + RETURN +* +* End of ZLAQP3RK +* + END diff --git a/lapack-netlib/SRC/zlaswlq.f b/lapack-netlib/SRC/zlaswlq.f index be4c48539..735207132 100644 --- a/lapack-netlib/SRC/zlaswlq.f +++ b/lapack-netlib/SRC/zlaswlq.f @@ -96,22 +96,23 @@ *> The leading dimension of the array T. LDT >= MB. *> \endverbatim *> -*> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= MB*M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -159,33 +160,37 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup laswlq +*> * ===================================================================== SUBROUTINE ZLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), T( LDT, *) + COMPLEX*16 A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL ZGELQT, ZTPLQT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -196,12 +201,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LE.0 ) THEN INFO = -4 @@ -209,60 +221,61 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = MB*M +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN - CALL ZGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) + IF( (M.GE.N) .OR. (NB.LE.M) .OR. 
(NB.GE.N) ) THEN + CALL ZGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO ) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL ZGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL ZGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M, (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL ZTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), - $ LDA, T(1, CTR * M + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL ZTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + $ LDA, T(1, CTR * M + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL ZTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), - $ LDA, T(1, CTR * M + 1), LDT, - $ WORK, INFO ) - END IF + $ LDA, T(1, CTR * M + 1), LDT, + $ WORK, INFO ) + END IF * - WORK( 1 ) = M * MB + WORK( 1 ) = LWMIN RETURN * * End of ZLASWLQ diff --git a/lapack-netlib/SRC/zlatrs3.f b/lapack-netlib/SRC/zlatrs3.f index 231a17274..27eac839b 100644 --- a/lapack-netlib/SRC/zlatrs3.f +++ b/lapack-netlib/SRC/zlatrs3.f @@ -158,7 +158,11 @@ *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -166,6 +170,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -182,7 +187,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -257,7 +262,7 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. @@ -296,15 +301,24 @@ * row. WORK( I + KK * LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = LWMIN * * Test the input parameters. * @@ -326,7 +340,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. 
LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zlatsqr.f b/lapack-netlib/SRC/zlatsqr.f index 8c938aebc..24d00f28a 100644 --- a/lapack-netlib/SRC/zlatsqr.f +++ b/lapack-netlib/SRC/zlatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -161,33 +164,37 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE ZLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), T(LDT, *) + COMPLEX*16 A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, LWMIN, MINMN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. EXTERNAL SUBROUTINES .. - EXTERNAL ZGEQRT, ZTPQRT, XERBLA + EXTERNAL ZGEQRT, ZTPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -198,6 +205,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -205,64 +219,65 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = NB*N +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL ZGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + IF( (MB.LE.N) .OR. 
(MB.GE.M) ) THEN + CALL ZGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL ZGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) - CTR = 1 + CALL ZGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = MB+1, II-MB+N , (MB-N) + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL ZTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, - $ T(1, CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL ZTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + $ T(1, CTR * N + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL ZTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, - $ T(1,CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + IF( II.LE.M ) THEN + CALL ZTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + $ T(1,CTR * N + 1), LDT, + $ WORK, INFO ) + END IF * - work( 1 ) = N*NB + WORK( 1 ) = LWMIN RETURN * * End of ZLATSQR diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt index b4e2223f7..d4e6f970d 100644 --- a/lapack-netlib/TESTING/CMakeLists.txt +++ b/lapack-netlib/TESTING/CMakeLists.txt @@ -54,6 +54,9 @@ add_lapack_test(sgqr.out gqr.in xeigtsts) add_lapack_test(sgsv.out gsv.in xeigtsts) add_lapack_test(scsd.out csd.in xeigtsts) add_lapack_test(slse.out lse.in xeigtsts) +# +# ======== SINGLE DMD EIG TESTS =========================== +add_lapack_test(sdmd.out sdmd.in xdmdeigtsts) endif() if(BUILD_DOUBLE) @@ -85,6 +88,9 @@ add_lapack_test(dgqr.out gqr.in xeigtstd) add_lapack_test(dgsv.out gsv.in xeigtstd) add_lapack_test(dcsd.out csd.in xeigtstd) add_lapack_test(dlse.out lse.in xeigtstd) +# +# ======== DOUBLE DMD EIG TESTS =========================== +add_lapack_test(ddmd.out ddmd.in xdmdeigtstd) endif() if(BUILD_COMPLEX) @@ -114,6 +120,9 @@ add_lapack_test(cgqr.out gqr.in xeigtstc) add_lapack_test(cgsv.out gsv.in xeigtstc) add_lapack_test(ccsd.out csd.in xeigtstc) add_lapack_test(clse.out lse.in xeigtstc) +# +# ======== COMPLEX DMD EIG TESTS =========================== +add_lapack_test(cdmd.out cdmd.in xdmdeigtstc) endif() if(BUILD_COMPLEX16) @@ -145,6 +154,9 @@ add_lapack_test(zgqr.out gqr.in xeigtstz) add_lapack_test(zgsv.out gsv.in xeigtstz) add_lapack_test(zcsd.out csd.in xeigtstz) add_lapack_test(zlse.out lse.in xeigtstz) +# +# ======== COMPLEX16 DMD EIG TESTS =========================== +add_lapack_test(zdmd.out zdmd.in xdmdeigtstz) endif() diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index d252c7fa9..e7236677a 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -42,6 +42,8 @@ set(SEIGTST schkee.F sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f sstt22.f ssyl01.f ssyt21.f ssyt22.f) +set(SDMDEIGTST schkdmd.f90) + set(CEIGTST cchkee.F cbdt01.f cbdt02.f cbdt03.f cbdt05.f cchkbb.f cchkbd.f cchkbk.f cchkbl.f cchkec.f @@ -59,6 +61,8 @@ set(CEIGTST cchkee.F csgt01.f cslect.f csyl01.f cstt21.f cstt22.f cunt01.f cunt03.f) +set(CDMDEIGTST cchkdmd.f90) + set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f dsvdch.f dsvdct.f dsxt1.f) @@ -79,6 +83,8 @@ 
set(DEIGTST dchkee.F dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f dstt22.f dsyl01.f dsyt21.f dsyt22.f) +set(DDMDEIGTST dchkdmd.f90) + set(ZEIGTST zchkee.F zbdt01.f zbdt02.f zbdt03.f zbdt05.f zchkbb.f zchkbd.f zchkbk.f zchkbl.f zchkec.f @@ -96,6 +102,8 @@ set(ZEIGTST zchkee.F zsgt01.f zslect.f zsyl01.f zstt21.f zstt22.f zunt01.f zunt03.f) +set(ZDMDEIGTST zchkdmd.f90) + macro(add_eig_executable name) add_executable(${name} ${ARGN}) target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) @@ -104,16 +112,20 @@ endmacro() if(BUILD_SINGLE) add_eig_executable(xeigtsts ${SEIGTST} ${SCIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtsts ${SDMDEIGTST}) endif() if(BUILD_COMPLEX) add_eig_executable(xeigtstc ${CEIGTST} ${SCIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtstc ${CDMDEIGTST}) endif() if(BUILD_DOUBLE) add_eig_executable(xeigtstd ${DEIGTST} ${DZIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtstd ${DDMDEIGTST}) endif() if(BUILD_COMPLEX16) add_eig_executable(xeigtstz ${ZEIGTST} ${DZIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtstz ${ZDMDEIGTST}) endif() diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index 942ae6982..9cf0fc95e 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -64,6 +64,8 @@ SEIGTST = schkee.o \ sort03.o ssbt21.o ssgt01.o sslect.o sspt21.o sstt21.o \ sstt22.o ssyl01.o ssyt21.o ssyt22.o +SDMDEIGTST = schkdmd.o + CEIGTST = cchkee.o \ cbdt01.o cbdt02.o cbdt03.o cbdt05.o \ cchkbb.o cchkbd.o cchkbk.o cchkbl.o cchkec.o \ @@ -81,6 +83,8 @@ CEIGTST = cchkee.o \ csgt01.o cslect.o csyl01.o\ cstt21.o cstt22.o cunt01.o cunt03.o +CDMDEIGTST = cchkdmd.o + DZIGTST = dlafts.o dlahd2.o dlasum.o dlatb9.o dstech.o dstect.o \ dsvdch.o dsvdct.o dsxt1.o @@ -101,6 +105,8 @@ DEIGTST = dchkee.o \ dort03.o dsbt21.o dsgt01.o dslect.o dspt21.o dstt21.o \ dstt22.o dsyl01.o dsyt21.o dsyt22.o +DDMDEIGTST = dchkdmd.o + ZEIGTST = zchkee.o \ zbdt01.o zbdt02.o zbdt03.o zbdt05.o \ zchkbb.o zchkbd.o zchkbk.o zchkbl.o zchkec.o \ @@ -118,14 +124,28 @@ ZEIGTST = zchkee.o \ zsgt01.o zslect.o zsyl01.o\ zstt21.o zstt22.o zunt01.o zunt03.o +ZDMDEIGTST = zchkdmd.o + .PHONY: all all: single complex double complex16 .PHONY: single complex double complex16 -single: xeigtsts -complex: xeigtstc -double: xeigtstd -complex16: xeigtstz +single: xeigtsts xdmdeigtsts +complex: xeigtstc xdmdeigtstc +double: xeigtstd xdmdeigtstd +complex16: xeigtstz xdmdeigtstz + +xdmdeigtsts: $(SDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ + +xdmdeigtstc: $(CDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ + +xdmdeigtstd: $(DDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ + +xdmdeigtstz: $(ZDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ @@ -139,6 +159,10 @@ xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +$(SDMDEIGTST): $(FRC) +$(CDMDEIGTST): $(FRC) +$(DDMDEIGTST): $(FRC) +$(ZDMDEIGTST): $(FRC) $(AEIGTST): $(FRC) $(SCIGTST): $(FRC) $(DZIGTST): $(FRC) @@ -155,7 +179,7 @@ clean: cleanobj cleanexe cleanobj: rm -f *.o cleanexe: - rm -f xeigtst* + rm -f xeigtst* xdmdeigtst* schkee.o: schkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< @@ 
-165,3 +189,11 @@ cchkee.o: cchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< zchkee.o: zchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< +schkdmd.o: schkdmd.f90 + $(FC) $(FFLAGS_DRV) -c -o $@ $< +cchkdmd.o: cchkdmd.f90 + $(FC) $(FFLAGS_DRV) -c -o $@ $< +dchkdmd.o: dchkdmd.f90 + $(FC) $(FFLAGS_DRV) -c -o $@ $< +zchkdmd.o: zchkdmd.f90 + $(FC) $(FFLAGS_DRV) -c -o $@ $< diff --git a/lapack-netlib/TESTING/EIG/alareq.f b/lapack-netlib/TESTING/EIG/alareq.f index 2585a686a..2cbe6db38 100644 --- a/lapack-netlib/TESTING/EIG/alareq.f +++ b/lapack-netlib/TESTING/EIG/alareq.f @@ -28,12 +28,12 @@ *> to evaluate the input line which requested NMATS matrix types for *> PATH. The flow of control is as follows: *> -*> If NMATS = NTYPES then +*> IF NMATS = NTYPES THEN *> DOTYPE(1:NTYPES) = .TRUE. -*> else +*> ELSE *> Read the next input line for NMATS matrix types *> Set DOTYPE(I) = .TRUE. for each valid type I -*> endif +*> END IF *> \endverbatim * * Arguments: diff --git a/lapack-netlib/TESTING/EIG/alarqg.f b/lapack-netlib/TESTING/EIG/alarqg.f index 6e2e6e7ec..b9fb88c65 100644 --- a/lapack-netlib/TESTING/EIG/alarqg.f +++ b/lapack-netlib/TESTING/EIG/alarqg.f @@ -28,12 +28,12 @@ *> to evaluate the input line which requested NMATS matrix types for *> PATH. The flow of control is as follows: *> -*> If NMATS = NTYPES then +*> IF NMATS = NTYPES THEN *> DOTYPE(1:NTYPES) = .TRUE. -*> else +*> ELSE *> Read the next input line for NMATS matrix types *> Set DOTYPE(I) = .TRUE. for each valid type I -*> endif +*> END IF *> \endverbatim * * Arguments: diff --git a/lapack-netlib/TESTING/EIG/cchkdmd.f90 b/lapack-netlib/TESTING/EIG/cchkdmd.f90 new file mode 100644 index 000000000..a9c181da9 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/cchkdmd.f90 @@ -0,0 +1,721 @@ +! This is a test program for checking the implementations +! of the following subroutines +! +! CGEDMD, for computation of the +! Dynamic Mode Decomposition (DMD) +! CGEDMDQ, for computation of a +! QR factorization based compressed DMD +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! ======================================================== +! How to run the code (compiler, link info) +! ======================================================== +! Compile as FORTRAN 90 (or later) and link with BLAS and +! LAPACK libraries. +! NOTE: The code is developed and tested on top of the +! Intel MKL library (versions 2022.0.3 and 2022.2.0), +! using the Intel Fortran compiler. +! +! For developers of the C++ implementation +! ======================================================== +! See the LAPACK++ and Template Numerical Toolkit (TNT) +! +! Note on a development of the GPU HP implementation +! ======================================================== +! Work in progress. See CUDA, MAGMA, SLATE. +! NOTE: The four SVD subroutines used in this code are +! included as a part of R&D and for completeness. +! This was also an opportunity to test those SVD codes. +! If the scaling option is used, all four are essentially +! equally good. For implementations on HP platforms, +! one can use whichever SVD is available. +!............................................................ +
!............................................................ +!............................................................ +!
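+! How to run (editorial sketch, not upstream documentation): for +! each of the four LLOOP passes the driver reads the snapshot +! dimension M and the number of snapshots N from standard input +! (see the READ statements below), so it is normally driven by a +! small input file such as the cdmd.in used by the CMake and +! Makefile test targets above, e.g. +! ./xdmdeigtstc < cdmd.in > cdmd.out +!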
+ PROGRAM DMD_TEST + + use iso_fortran_env + IMPLICIT NONE + integer, parameter :: WP = real32 +!............................................................ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP + + COMPLEX(KIND=WP), PARAMETER :: CONE = ( 1.0_WP, 0.0_WP ) + COMPLEX(KIND=WP), PARAMETER :: CZERO = ( 0.0_WP, 0.0_WP ) +!............................................................ + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:) :: RES, & + RES1, RESEX, SINGVX, SINGVQX, WORK + INTEGER , ALLOCATABLE, DIMENSION(:) :: IWORK + REAL(KIND=WP) :: WDUMMY(2) + INTEGER :: IDUMMY(4), ISEED(4) + REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, EPS, & + TOL, TOL2, SVDIFF, TMP, TMP_AU, & + TMP_FQR, TMP_REZ, TMP_REZQ, TMP_XW, & + TMP_EX +!............................................................ + COMPLEX(KIND=WP) :: CMAX + INTEGER :: LCWORK + COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:,:) :: A, AC, & + AU, F, F0, F1, S, W, & + X, X0, Y, Y0, Y1, Z, Z1 + COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:) :: CDA, CDR, & + CDL, CEIGS, CEIGSA, CWORK + COMPLEX(KIND=WP) :: CDUMMY(22), CDUM2X2(2,2) +!............................................................ + INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY, & + LDZ, LIWORK, LWORK, M, N, LLOOP, NRNK + INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, j, & + NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ, & + NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, & + NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD + INTEGER :: iNRNK, iWHTSVD, K_traj, LWMINOPT + CHARACTER :: GRADE, JOBREF, JOBZ, PIVTNG, RSIGN, & + SCALE, RESIDS, WANTQ, WANTR + LOGICAL :: TEST_QRDMD + +!..... external subroutines (BLAS and LAPACK) + EXTERNAL CAXPY, CGEEV, CGEMM, CGEMV, CLASCL +!.....external subroutines DMD package +! subroutines under test + EXTERNAL CGEDMD, CGEDMDQ +!..... external functions (BLAS and LAPACK) + EXTERNAL SCNRM2, SLAMCH + REAL(KIND=WP) :: SCNRM2, SLAMCH + EXTERNAL CLANGE + REAL(KIND=WP) :: CLANGE + EXTERNAL ICAMAX + INTEGER ICAMAX + EXTERNAL LSAME + LOGICAL LSAME + + INTRINSIC ABS, INT, MIN, MAX, SIGN +!............................................................ + + + WRITE(*,*) 'COMPLEX CODE TESTING' + + ! The test is always in pairs : ( CGEDMD and CGEDMDQ) + ! because the test includes comparing the results (in pairs). +!..................................................................................... + ! This code by default performs tests on CGEDMDQ + ! Since the QR factorizations based algorithm is designed for + ! single trajectory data, only single trajectory tests will + ! be performed with xGEDMDQ. + + WANTQ = 'Q' + WANTR = 'R' +!................................................................................. + + EPS = SLAMCH( 'P' ) ! machine precision WP + + ! Global counters of failures of some particular tests + NFAIL = 0 + NFAIL_REZ = 0 + NFAIL_REZQ = 0 + NFAIL_Z_XV = 0 + NFAIL_F_QR = 0 + NFAIL_AU = 0 + NFAIL_SVDIFF = 0 + NFAIL_TOTAL = 0 + NFAILQ_TOTAL = 0 + + DO LLOOP = 1, 4 + + WRITE(*,*) 'L Loop Index = ', LLOOP + + ! Set the dimensions of the problem ... + READ(*,*) M + WRITE(*,*) 'M = ', M + ! ... and the number of snapshots. + READ(*,*) N + WRITE(*,*) 'N = ', N + + ! Test the dimensions + IF ( ( MIN(M,N) == 0 ) .OR. ( M < N ) ) THEN + WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.' + STOP + END IF +!............. + ! The seed inside the LLOOP so that each pass can be reproduced easily. 
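+! (Editorial note: the fixed seed (4,3,2,1) set below feeds both +! CLATMR and CLARNV from the same stream, so every pass of the +! loop regenerates identical test data.)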
+ ISEED(1) = 4 + ISEED(2) = 3 + ISEED(3) = 2 + ISEED(4) = 1 + + LDA = M + LDF = M + LDX = M + LDY = M + LDW = N + LDZ = M + LDAU = M + LDS = N + + TMP_XW = ZERO + TMP_AU = ZERO + TMP_REZ = ZERO + TMP_REZQ = ZERO + SVDIFF = ZERO + TMP_EX = ZERO + + ALLOCATE( A(LDA,M) ) + ALLOCATE( AC(LDA,M) ) + ALLOCATE( F(LDF,N+1) ) + ALLOCATE( F0(LDF,N+1) ) + ALLOCATE( F1(LDF,N+1) ) + ALLOCATE( X(LDX,N) ) + ALLOCATE( X0(LDX,N) ) + ALLOCATE( Y(LDY,N+1) ) + ALLOCATE( Y0(LDY,N+1) ) + ALLOCATE( Y1(LDY,N+1) ) + ALLOCATE( AU(LDAU,N) ) + ALLOCATE( W(LDW,N) ) + ALLOCATE( S(LDS,N) ) + ALLOCATE( Z(LDZ,N) ) + ALLOCATE( Z1(LDZ,N) ) + ALLOCATE( RES(N) ) + ALLOCATE( RES1(N) ) + ALLOCATE( RESEX(N) ) + ALLOCATE( CEIGS(N) ) + ALLOCATE( SINGVX(N) ) + ALLOCATE( SINGVQX(N) ) + + TOL = 10*M*EPS + TOL2 = 10*M*N*EPS + +!............. + + DO K_traj = 1, 2 + ! Number of initial conditions in the simulation/trajectories (1 or 2) + + COND = 1.0D4 + CMAX = (1.0D1,1.0D1) + RSIGN = 'F' + GRADE = 'N' + MODEL = 6 + CONDL = 1.0D1 + MODER = 6 + CONDR = 1.0D1 + PIVTNG = 'N' + ! Loop over all parameter MODE values for CLATMR (+1,..,+6) + + DO MODE = 1, 6 + + ALLOCATE( IWORK(2*M) ) + ALLOCATE( CDA(M) ) + ALLOCATE( CDL(M) ) + ALLOCATE( CDR(M) ) + + CALL CLATMR( M, M, 'N', ISEED, 'N', CDA, MODE, COND, & + CMAX, RSIGN, GRADE, CDL, MODEL, CONDL, & + CDR, MODER, CONDR, PIVTNG, IWORK, M, M, & + ZERO, -ONE, 'N', A, LDA, IWORK(M+1), INFO ) + DEALLOCATE( CDR ) + DEALLOCATE( CDL ) + DEALLOCATE( CDA ) + DEALLOCATE( IWORK ) + + LCWORK = MAX(1,2*M) + ALLOCATE( CEIGSA(M) ) + ALLOCATE( CWORK(LCWORK) ) + ALLOCATE( WORK(2*M) ) + AC(1:M,1:M) = A(1:M,1:M) + CALL CGEEV( 'N','N', M, AC, LDA, CEIGSA, CDUM2X2, 2, & + CDUM2X2, 2, CWORK, LCWORK, WORK, INFO ) ! LAPACK CALL + DEALLOCATE(WORK) + DEALLOCATE(CWORK) + + TMP = ABS(CEIGSA(ICAMAX(M, CEIGSA, 1))) ! The spectral radius of A + ! Scale the matrix A to have unit spectral radius. + CALL CLASCL( 'G',0, 0, TMP, ONE, M, M, & + A, LDA, INFO ) + CALL CLASCL( 'G',0, 0, TMP, ONE, M, 1, & + CEIGSA, M, INFO ) + ANORM = CLANGE( 'F', M, M, A, LDA, WDUMMY ) + + IF ( K_traj == 2 ) THEN + ! generate data as two trajectories + ! with two initial conditions + CALL CLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N/2 + CALL CGEMV( 'N', M, M, CONE, A, LDA, F(1,i), 1, & + CZERO, F(1,i+1), 1 ) + END DO + X0(1:M,1:N/2) = F(1:M,1:N/2) + Y0(1:M,1:N/2) = F(1:M,2:N/2+1) + + CALL CLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N-N/2 + CALL CGEMV( 'N', M, M, CONE, A, LDA, F(1,i), 1, & + CZERO, F(1,i+1), 1 ) + END DO + X0(1:M,N/2+1:N) = F(1:M,1:N-N/2) + Y0(1:M,N/2+1:N) = F(1:M,2:N-N/2+1) + ELSE + CALL CLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N + CALL CGEMV( 'N', M, M, CONE, A, M, F(1,i), 1, & + CZERO, F(1,i+1), 1 ) + END DO + F0(1:M,1:N+1) = F(1:M,1:N+1) + X0(1:M,1:N) = F0(1:M,1:N) + Y0(1:M,1:N) = F0(1:M,2:N+1) + END IF + + DEALLOCATE( CEIGSA ) +!........................................................................ + + DO iJOBZ = 1, 4 + + SELECT CASE ( iJOBZ ) + CASE(1) + JOBZ = 'V' + RESIDS = 'R' + CASE(2) + JOBZ = 'V' + RESIDS = 'N' + CASE(3) + JOBZ = 'F' + RESIDS = 'N' + CASE(4) + JOBZ = 'N' + RESIDS = 'N' + END SELECT + + DO iJOBREF = 1, 3 + + SELECT CASE ( iJOBREF ) + CASE(1) + JOBREF = 'R' + CASE(2) + JOBREF = 'E' + CASE(3) + JOBREF = 'N' + END SELECT + + DO iSCALE = 1, 4 + + SELECT CASE ( iSCALE ) + CASE(1) + SCALE = 'S' + CASE(2) + SCALE = 'C' + CASE(3) + SCALE = 'Y' + CASE(4) + SCALE = 'N' + END SELECT + + DO iNRNK = -1, -2, -1 + NRNK = iNRNK + + DO iWHTSVD = 1, 3 + ! Check the three options (WHTSVD = 1, 2, 3) to compute + ! the POD basis via the SVD.
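+ ! (Editorial note, hedged: in the xGEDMD drivers WHTSVD + ! selects the LAPACK SVD routine used for the POD basis, + ! presumably 1 = CGESVD, 2 = CGESDD, 3 = CGESVDQ; option 4, + ! xGEJSV, exists only for real types, which is why this + ! complex driver stops at 3.)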
+ WHTSVD = iWHTSVD + + DO LWMINOPT = 1, 2 + ! Workspace query for the minimal (1) and for the optimal + ! (2) workspace lengths determined by workspace query. + + ! CGEDMD is always tested and its results are also used for + ! comparisons with CGEDMDQ. + + X(1:M,1:N) = X0(1:M,1:N) + Y(1:M,1:N) = Y0(1:M,1:N) + + CALL CGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, CEIGS, Z, LDZ, RES, & + AU, LDAU, W, LDW, S, LDS, & + CDUMMY, -1, WDUMMY, -1, IDUMMY, -1, INFO ) + + IF ( (INFO .EQ. 2) .OR. ( INFO .EQ. 3 ) & + .OR. ( INFO < 0 ) ) THEN + WRITE(*,*) 'Call to CGEDMD workspace query failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ', & + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL, LDZ, LDAU, LDW, LDS + STOP + ELSE + !WRITE(*,*) '... done. Workspace length computed.' + END IF + + LCWORK = INT(CDUMMY(LWMINOPT)) + ALLOCATE(CWORK(LCWORK)) + LIWORK = IDUMMY(1) + ALLOCATE(IWORK(LIWORK)) + LWORK = INT(WDUMMY(1)) + ALLOCATE(WORK(LWORK)) + + CALL CGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, CEIGS, Z, LDZ, RES, & + AU, LDAU, W, LDW, S, LDS, & + CWORK, LCWORK, WORK, LWORK, IWORK, LIWORK, INFO ) + IF ( INFO /= 0 ) THEN + WRITE(*,*) 'Call to CGEDMD failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + STOP + END IF + SINGVX(1:N) = WORK(1:N) + + !...... CGEDMD check point + IF ( LSAME(JOBZ,'V') ) THEN + ! Check that Z = X*W, on return from CGEDMD + ! This checks that the returned eigenvectors in Z are + ! the product of the SVD'POD basis returned in X + ! and the eigenvectors of the Rayleigh quotient + ! returned in W + CALL CGEMM( 'N', 'N', M, K, K, CONE, X, LDX, W, LDW, & + CZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL CAXPY( M, -CONE, Z(1,i), 1, Z1(1,i), 1) + TMP = MAX(TMP, SCNRM2( M, Z1(1,i), 1 ) ) + END DO + TMP_XW = MAX(TMP_XW, TMP ) + IF ( TMP_XW <= TOL ) THEN + !WRITE(*,*) ' :) .... OK .........CGEDMD PASSED.' + ELSE + NFAIL_Z_XV = NFAIL_Z_XV + 1 + WRITE(*,*) ':( .................CGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + END IF + !...... CGEDMD check point + + IF ( LSAME(JOBREF,'R') ) THEN + ! The matrix A*U is returned for computing refined Ritz vectors. + ! Check that A*U is computed correctly using the formula + ! A*U = Y * V * inv(SIGMA). This depends on the + ! accuracy in the computed singular values and vectors of X. + ! See the paper for an error analysis. + ! Note that the left singular vectors of the input matrix X + ! are returned in the array X. + CALL CGEMM( 'N', 'N', M, K, M, CONE, A, LDA, X, LDX, & + CZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL CAXPY( M, -CONE, AU(1,i), 1, Z1(1,i), 1) + TMP = MAX( TMP, SCNRM2( M, Z1(1,i),1 ) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_AU = MAX( TMP_AU, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) ':) .... OK .........CGEDMD PASSED.' + ELSE + NFAIL_AU = NFAIL_AU + 1 + WRITE(*,*) ':( .................CGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL2 + END IF + ELSEIF ( LSAME(JOBREF,'E') ) THEN + ! 
The unscaled vectors of the Exact DMD are computed. + ! This option is included for the sake of completeness, + ! for users who prefer the Exact DMD vectors. The + ! returned vectors are stored in the same way + ! as the Ritz vectors. Here we just save the vectors + ! and test them separately using a Matlab script. + CALL CGEMM( 'N', 'N', M, K, M, CONE, A, LDA, AU, LDAU, CZERO, Y1, LDY ) + + DO i=1, K + CALL CAXPY( M, -CEIGS(i), AU(1,i), 1, Y1(1,i), 1 ) + RESEX(i) = SCNRM2( M, Y1(1,i), 1) / SCNRM2(M,AU(1,i),1) + END DO + END IF + !...... CGEDMD check point + + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by CGEDMD with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL CGEMM( 'N', 'N', M, K, M, CONE, A, LDA, Z, LDZ, CZERO, Y1, LDY ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i). (See the + ! description of Z in CGEDMD.) + + DO i=1, K + CALL CAXPY( M, -CEIGS(i), Z(1,i), 1, Y1(1,i), 1 ) + RES1(i) = SCNRM2( M, Y1(1,i), 1) + END DO + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_REZ = MAX( TMP_REZ, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) ':) .... OK ..........CGEDMD PASSED.' + ELSE + NFAIL_REZ = NFAIL_REZ + 1 + WRITE(*,*) ':( ..................CGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + + IF ( LSAME(JOBREF,'E') ) THEN + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) ) + END DO + TMP_EX = MAX(TMP_EX,TMP) + END IF + + END IF + + DEALLOCATE(CWORK) + DEALLOCATE(WORK) + DEALLOCATE(IWORK) + +!....................................................................................................... + + IF ( K_traj == 1 ) THEN + + F(1:M,1:N+1) = F0(1:M,1:N+1) + CALL CGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, & + WHTSVD, M, N+1, F, LDF, X, LDX, Y, LDY, & + NRNK, TOL, K, CEIGS, Z, LDZ, RES, AU, & + LDAU, W, LDW, S, LDS, CDUMMY, -1, & + WDUMMY, -1, IDUMMY, -1, INFO ) + + LCWORK = INT(CDUMMY(LWMINOPT)) + ALLOCATE(CWORK(LCWORK)) + LIWORK = IDUMMY(1) + ALLOCATE(IWORK(LIWORK)) + LWORK = INT(WDUMMY(1)) + ALLOCATE(WORK(LWORK)) + + CALL CGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, & + WHTSVD, M, N+1, F, LDF, X, LDX, Y, LDY, & + NRNK, TOL, KQ, CEIGS, Z, LDZ, RES, AU, & + LDAU, W, LDW, S, LDS, CWORK, LCWORK, & + WORK, LWORK, IWORK, LIWORK, INFO ) + IF ( INFO /= 0 ) THEN + WRITE(*,*) 'Call to CGEDMDQ failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, WANTQ, WANTR, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + STOP + END IF + SINGVQX(1:N) = WORK(1:N) + + !..... CGEDMDQ check point + + TMP = ZERO + DO i = 1, MIN(K, KQ) + TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / & + SINGVX(1) ) + END DO + SVDIFF = MAX( SVDIFF, TMP ) + IF ( TMP > TOL2 ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_SVDIFF = NFAIL_SVDIFF + 1 + END IF + !..... CGEDMDQ check point + + !..... CGEDMDQ check point + IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN + ! Check that the QR factors are computed and returned + ! as requested. The residual ||F-Q*R||_F / ||F||_F + ! is compared to M*N*EPS.
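+ ! (Editorial note: on return from CGEDMDQ with WANTQ = 'Q' + ! and WANTR = 'R', the orthonormal factor is expected in F + ! and the triangular factor in Y; this reading follows the + ! calling sequence of the GEMM below rather than upstream + ! documentation.)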
+ F1(1:M,1:N+1) = F0(1:M,1:N+1) + CALL CGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -CONE, F, & + LDF, Y, LDY, CONE, F1, LDF ) + TMP_FQR = CLANGE( 'F', M, N+1, F1, LDF, WORK ) / & + CLANGE( 'F', M, N+1, F0, LDF, WORK ) + IF ( TMP_FQR <= TOL2 ) THEN + !WRITE(*,*) ':) CGEDMDQ ........ PASSED.' + ELSE + WRITE(*,*) ':( CGEDMDQ ........ FAILED.' + NFAIL_F_QR = NFAIL_F_QR + 1 + END IF + END IF + !..... CGEDMDQ check point + !..... CGEDMDQ check point + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by CGEDMDQ with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL CGEMM( 'N', 'N', M, KQ, M, CONE, A, LDA, Z, LDZ, CZERO, Y1, LDY ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i). (See the + ! description of Z in CGEDMDQ.) + DO i = 1, KQ + CALL CAXPY( M, -CEIGS(i), Z(1,i), 1, Y1(1,i), 1 ) + RES1(i) = SCNRM2( M, Y1(1,i), 1) + END DO + TMP = ZERO + DO i = 1, KQ + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVQX(KQ)/(ANORM*SINGVQX(1)) ) + END DO + TMP_REZQ = MAX( TMP_REZQ, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) '.... OK ........ CGEDMDQ PASSED.' + ELSE + NFAIL_REZQ = NFAIL_REZQ + 1 + WRITE(*,*) '................ CGEDMDQ FAILED!', & + 'Check the code for implementation errors.' + END IF + END IF + + DEALLOCATE(CWORK) + DEALLOCATE(WORK) + DEALLOCATE(IWORK) + + END IF + + END DO ! LWMINOPT + !write(*,*) 'LWMINOPT loop completed' + END DO ! iWHTSVD + !write(*,*) 'WHTSVD loop completed' + END DO ! iNRNK -2:-1 + !write(*,*) 'NRNK loop completed' + END DO ! iSCALE 1:4 + !write(*,*) 'SCALE loop completed' + END DO + !write(*,*) 'JOBREF loop completed' + END DO ! iJOBZ + !write(*,*) 'JOBZ loop completed' + + END DO ! MODE 1:6 + !write(*,*) 'MODE loop completed' + END DO ! 1 or 2 trajectories + !write(*,*) 'trajectories loop completed' + + DEALLOCATE( A ) + DEALLOCATE( AC ) + DEALLOCATE( Z ) + DEALLOCATE( F ) + DEALLOCATE( F0 ) + DEALLOCATE( F1 ) + DEALLOCATE( X ) + DEALLOCATE( X0 ) + DEALLOCATE( Y ) + DEALLOCATE( Y0 ) + DEALLOCATE( Y1 ) + DEALLOCATE( AU ) + DEALLOCATE( W ) + DEALLOCATE( S ) + DEALLOCATE( Z1 ) + DEALLOCATE( RES ) + DEALLOCATE( RES1 ) + DEALLOCATE( RESEX ) + DEALLOCATE( CEIGS ) + DEALLOCATE( SINGVX ) + DEALLOCATE( SINGVQX ) + + END DO ! LLOOP + + WRITE(*,*) + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for CGEDMD :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + IF ( NFAIL_Z_XV == 0 ) THEN + WRITE(*,*) '>>>> Z - U*V test PASSED.' + ELSE + WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)' + WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_XW + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV + END IF + + IF ( NFAIL_AU == 0 ) THEN + WRITE(*,*) '>>>> A*U test PASSED. ' + ELSE + WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)' + WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU + END IF + + + IF ( NFAIL_REZ == 0 ) THEN + WRITE(*,*) '>>>> Residual computation test PASSED.'
+ ELSE + WRITE(*,*) 'Residual computation test FAILED ', NFAIL_REZ, ' time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ + END IF + IF ( NFAIL_TOTAL == 0 ) THEN + WRITE(*,*) '>>>> CGEDMD :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAIL_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>>>>>>>>> CGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + WRITE(*,*) + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for CGEDMDQ :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + + IF ( NFAIL_SVDIFF == 0 ) THEN + WRITE(*,*) '>>>> CGEDMD and CGEDMDQ computed singular & + &values test PASSED.' + ELSE + WRITE(*,*) 'CGEDMD and CGEDMDQ discrepancies in & + &the singular values were unacceptable ', & + NFAIL_SVDIFF, ' times. Test FAILED.' + WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF + END IF + IF ( NFAIL_F_QR == 0 ) THEN + WRITE(*,*) '>>>> F - Q*R test PASSED.' + ELSE + WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)' + WRITE(*,*) 'The largest relative residual was ', TMP_FQR + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR + END IF + + IF ( NFAIL_REZQ == 0 ) THEN + WRITE(*,*) '>>>> Residual computation test PASSED.' + ELSE + WRITE(*,*) 'Residual computation test FAILED ', NFAIL_REZQ, ' time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ + END IF + + IF ( NFAILQ_TOTAL == 0 ) THEN + WRITE(*,*) '>>>>>>> CGEDMDQ :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>> CGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + WRITE(*,*) + WRITE(*,*) 'Test completed.'
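+ ! (Editorial note: failures are reported only through the NFAIL + ! counters and WRITE messages above; the program ends with a + ! plain STOP, so callers should inspect the captured output + ! rather than the exit status.)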
+ STOP + END diff --git a/lapack-netlib/TESTING/EIG/cerrst.f b/lapack-netlib/TESTING/EIG/cerrst.f index 1748a2aad..d23eb14ea 100644 --- a/lapack-netlib/TESTING/EIG/cerrst.f +++ b/lapack-netlib/TESTING/EIG/cerrst.f @@ -748,17 +748,17 @@ CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 2*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) NT = NT + 12 @@ -830,19 +830,19 @@ INFOT = 18 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0, 0.0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0, 0.0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 26*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0, 0.0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) NT = NT + 13 diff --git a/lapack-netlib/TESTING/EIG/chkxer.f b/lapack-netlib/TESTING/EIG/chkxer.f index fd00bb65a..70caf7e0a 100644 --- a/lapack-netlib/TESTING/EIG/chkxer.f +++ b/lapack-netlib/TESTING/EIG/chkxer.f @@ -61,7 +61,7 @@ RETURN * 9999 FORMAT( ' *** Illegal value of parameter number ', I2, - $ ' not detected by ', A6, ' ***' ) + $ ' not detected by ', A, ' ***' ) * * End of CHKXER * diff --git a/lapack-netlib/TESTING/EIG/dchkdmd.f90 b/lapack-netlib/TESTING/EIG/dchkdmd.f90 new file mode 100644 index 000000000..4fbf7531b --- /dev/null +++ b/lapack-netlib/TESTING/EIG/dchkdmd.f90 @@ -0,0 +1,813 @@ +! This is a test program for checking the implementations of +! the implementations of the following subroutines +! +! DGEDMD for computation of the +! Dynamic Mode Decomposition (DMD) +! DGEDMDQ for computation of a +! QR factorization based compressed DMD +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! ======================================================== +! How to run the code (compiler, link info) +! ======================================================== +! Compile as FORTRAN 90 (or later) and link with BLAS and +! LAPACK libraries. +! NOTE: The code is developed and tested on top of the +! Intel MKL library (versions 2022.0.3 and 2022.2.0), +! using the Intel Fortran compiler. +! +! For developers of the C++ implementation +! ======================================================== +! See the LAPACK++ and Template Numerical Toolkit (TNT) +! +! Note on a development of the GPU HP implementation +! ======================================================== +! Work in progress. 
See CUDA, MAGMA, SLATE. +! NOTE: The four SVD subroutines used in this code are +! included as a part of R&D and for completeness. +! This was also an opportunity to test those SVD codes. +! If the scaling option is used, all four are essentially +! equally good. For implementations on HP platforms, +! one can use whichever SVD is available. +!............................................................ +! NOTE: +! When using the Intel MKL 2022.0.3 the subroutine xGESVDQ +! (optionally used in xGEDMD) may cause access violation +! error for x = S, D, C, Z, but only if called with the +! work space query. (At least in our Windows 10 MSVS 2019.) +! The problem can be mitigated by downloading the source +! code of xGESVDQ from the LAPACK repository and using it +! locally instead of the one in the MKL. This seems to +! indicate that the problem is indeed in the MKL. +! This problem did not appear with Intel MKL 2022.2.0. +! +! NOTE: +! xGESDD seems to have a problem with workspace. In some +! cases the length of the optimal workspace is returned +! smaller than the minimal workspace, as specified in the +! code. As a precaution, all optimal workspaces are +! set as MAX(minimal, optimal). +! Latest implementations of complex xGESDD have different +! length of the real workspace. We use the max value over +! the two versions. +!............................................................ +!............................................................ +! + PROGRAM DMD_TEST + use iso_fortran_env, only: real64 + IMPLICIT NONE + integer, parameter :: WP = real64 + +!............................................................ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP +!............................................................ + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:,:) :: & + A, AC, EIGA, LAMBDA, LAMBDAQ, F, F1, F2,& + Z, Z1, S, AU, W, VA, X, X0, Y, Y0, Y1 + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:) :: & + DA, DL, DR, REIG, REIGA, REIGQ, IEIG, & + IEIGA, IEIGQ, RES, RES1, RESEX, SINGVX,& + SINGVQX, WORK + INTEGER , ALLOCATABLE, DIMENSION(:) :: IWORK + REAL(KIND=WP) :: AB(2,2), WDUMMY(2) + INTEGER :: IDUMMY(2), ISEED(4), RJOBDATA(8) + REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, DMAX, EPS, & + TOL, TOL2, SVDIFF, TMP, TMP_AU, & + TMP_FQR, TMP_REZ, TMP_REZQ, TMP_ZXW, & + TMP_EX, XNORM, YNORM +!............................................................ + INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY, & + LDZ, LIWORK, LWORK, M, N, L, LLOOP, NRNK + INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, j, KDIFF, & + NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ, & + NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, & + NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD + INTEGER iNRNK, iWHTSVD, K_TRAJ, LWMINOPT + CHARACTER(LEN=1) GRADE, JOBREF, JOBZ, PIVTNG, RSIGN, & + SCALE, RESIDS, WANTQ, WANTR + + LOGICAL TEST_QRDMD +!..... external subroutines (BLAS and LAPACK) + EXTERNAL DAXPY, DGEEV, DGEMM, DGEMV, DLACPY, DLASCL + EXTERNAL DLARNV, DLATMR +!.....external subroutines DMD package, part 1 +! subroutines under test + EXTERNAL DGEDMD, DGEDMDQ + +!..... external functions (BLAS and LAPACK) + EXTERNAL DLAMCH, DLANGE, DNRM2 + REAL(KIND=WP) :: DLAMCH, DLANGE, DNRM2 + EXTERNAL LSAME + LOGICAL LSAME + + INTRINSIC ABS, INT, MIN, MAX +!............................................................ + + ! The test is always in pairs : ( DGEDMD and DGEDMDQ ) + ! because the test includes comparing the results (in pairs).
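+ ! (Editorial note: like the complex driver, this program reads + ! an (M, N) pair from standard input for each of the four LLOOP + ! passes; the ddmd.in file referenced by the build systems is + ! assumed to supply them, e.g. ./xdmdeigtstd < ddmd.in > ddmd.out.)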
+!..................................................................................... + TEST_QRDMD = .TRUE. ! This code by default performs tests on DGEDMDQ + ! Since the QR factorizations based algorithm is designed for + ! single trajectory data, only single trajectory tests will + ! be performed with xGEDMDQ. + WANTQ = 'Q' + WANTR = 'R' +!................................................................................. + + EPS = DLAMCH( 'P' ) ! machine precision DP + + ! Global counters of failures of some particular tests + NFAIL = 0 + NFAIL_REZ = 0 + NFAIL_REZQ = 0 + NFAIL_Z_XV = 0 + NFAIL_F_QR = 0 + NFAIL_AU = 0 + KDIFF = 0 + NFAIL_SVDIFF = 0 + NFAIL_TOTAL = 0 + NFAILQ_TOTAL = 0 + + + DO LLOOP = 1, 4 + + WRITE(*,*) 'L Loop Index = ', LLOOP + + ! Set the dimensions of the problem ... + WRITE(*,*) 'M = ' + READ(*,*) M + WRITE(*,*) M + ! ... and the number of snapshots. + WRITE(*,*) 'N = ' + READ(*,*) N + WRITE(*,*) N + + ! ... Test the dimensions + IF ( ( MIN(M,N) == 0 ) .OR. ( M < N ) ) THEN + WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.' + STOP + END IF +!............. + ! The seed inside the LLOOP so that each pass can be reproduced easily. + + ISEED(1) = 4 + ISEED(2) = 3 + ISEED(3) = 2 + ISEED(4) = 1 + + LDA = M + LDF = M + LDX = MAX(M,N+1) + LDY = MAX(M,N+1) + LDW = N + LDZ = M + LDAU = MAX(M,N+1) + LDS = N + + TMP_ZXW = ZERO + TMP_AU = ZERO + TMP_REZ = ZERO + TMP_REZQ = ZERO + SVDIFF = ZERO + TMP_EX = ZERO + + ! + ! Test the subroutines on real data snapshots. All + ! computation is done in real arithmetic, even when + ! Koopman eigenvalues and modes are real. + ! + ! Allocate memory space + ALLOCATE( A(LDA,M) ) + ALLOCATE( AC(LDA,M) ) + ALLOCATE( DA(M) ) + ALLOCATE( DL(M) ) + ALLOCATE( F(LDF,N+1) ) + ALLOCATE( F1(LDF,N+1) ) + ALLOCATE( F2(LDF,N+1) ) + ALLOCATE( X(LDX,N) ) + ALLOCATE( X0(LDX,N) ) + ALLOCATE( SINGVX(N) ) + ALLOCATE( SINGVQX(N) ) + ALLOCATE( Y(LDY,N+1) ) + ALLOCATE( Y0(LDY,N+1) ) + ALLOCATE( Y1(M,N+1) ) + ALLOCATE( Z(LDZ,N) ) + ALLOCATE( Z1(LDZ,N) ) + ALLOCATE( RES(N) ) + ALLOCATE( RES1(N) ) + ALLOCATE( RESEX(N) ) + ALLOCATE( REIG(N) ) + ALLOCATE( IEIG(N) ) + ALLOCATE( REIGQ(N) ) + ALLOCATE( IEIGQ(N) ) + ALLOCATE( REIGA(M) ) + ALLOCATE( IEIGA(M) ) + ALLOCATE( VA(LDA,M) ) + ALLOCATE( LAMBDA(N,2) ) + ALLOCATE( LAMBDAQ(N,2) ) + ALLOCATE( EIGA(M,2) ) + ALLOCATE( W(LDW,N) ) + ALLOCATE( AU(LDAU,N) ) + ALLOCATE( S(N,N) ) + + TOL = M*EPS + ! This mimics O(M*N)*EPS bound for accumulated roundoff error. + ! The factor 10 is somewhat arbitrary. + TOL2 = 10*M*N*EPS + +!............. + + DO K_TRAJ = 1, 2 + ! Number of intial conditions in the simulation/trajectories (1 or 2) + + COND = 1.0D8 + DMAX = 1.0D2 + RSIGN = 'F' + GRADE = 'N' + MODEL = 6 + CONDL = 1.0D2 + MODER = 6 + CONDR = 1.0D2 + PIVTNG = 'N' + + ! Loop over all parameter MODE values for ZLATMR (+1,..,+6) + DO MODE = 1, 6 + + ALLOCATE( IWORK(2*M) ) + ALLOCATE(DR(N)) + CALL DLATMR( M, M, 'S', ISEED, 'N', DA, MODE, COND, & + DMAX, RSIGN, GRADE, DL, MODEL, CONDL, & + DR, MODER, CONDR, PIVTNG, IWORK, M, M, & + ZERO, -ONE, 'N', A, LDA, IWORK(M+1), INFO ) + DEALLOCATE(IWORK) + DEALLOCATE(DR) + + LWORK = 4*M+1 + ALLOCATE(WORK(LWORK)) + AC = A + CALL DGEEV( 'N','V', M, AC, M, REIGA, IEIGA, VA, M, & + VA, M, WORK, LWORK, INFO ) ! LAPACK CALL + DEALLOCATE(WORK) + TMP = ZERO + DO i = 1, M + EIGA(i,1) = REIGA(i) + EIGA(i,2) = IEIGA(i) + TMP = MAX( TMP, SQRT(REIGA(i)**2+IEIGA(i)**2)) + END DO + + ! Scale A to have the desirable spectral radius. 
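+ ! (Editorial note: TMP above is the spectral radius of the + ! random A, so the two DLASCL calls below divide A and its + ! stored eigenvalues EIGA by TMP; with spectral radius 1 the + ! generated snapshot sequences do not blow up exponentially.)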
+ CALL DLASCL( 'G', 0, 0, TMP, ONE, M, M, A, M, INFO ) + CALL DLASCL( 'G', 0, 0, TMP, ONE, M, 2, EIGA, M, INFO ) + + ! Compute the norm of A + ANORM = DLANGE( 'F', N, N, A, M, WDUMMY ) + + IF ( K_TRAJ == 2 ) THEN + ! generate data with two initial conditions + CALL DLARNV(2, ISEED, M, F1(1,1) ) + F1(1:M,1) = 1.0E-10*F1(1:M,1) + DO i = 1, N/2 + CALL DGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, & + F1(1,i+1), 1 ) + END DO + X0(1:M,1:N/2) = F1(1:M,1:N/2) + Y0(1:M,1:N/2) = F1(1:M,2:N/2+1) + + CALL DLARNV(2, ISEED, M, F1(1,1) ) + DO i = 1, N-N/2 + CALL DGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, & + F1(1,i+1), 1 ) + END DO + X0(1:M,N/2+1:N) = F1(1:M,1:N-N/2) + Y0(1:M,N/2+1:N) = F1(1:M,2:N-N/2+1) + ELSE + CALL DLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N + CALL DGEMV( 'N', M, M, ONE, A, M, F(1,i), 1, ZERO, & + F(1,i+1), 1 ) + END DO + X0(1:M,1:N) = F(1:M,1:N) + Y0(1:M,1:N) = F(1:M,2:N+1) + END IF + + XNORM = DLANGE( 'F', M, N, X0, LDX, WDUMMY ) + YNORM = DLANGE( 'F', M, N, Y0, LDX, WDUMMY ) +!............................................................ + + DO iJOBZ = 1, 4 + + SELECT CASE ( iJOBZ ) + CASE(1) + JOBZ = 'V' ! Ritz vectors will be computed + RESIDS = 'R' ! Residuals will be computed + CASE(2) + JOBZ = 'V' + RESIDS = 'N' + CASE(3) + JOBZ = 'F' ! Ritz vectors in factored form + RESIDS = 'N' + CASE(4) + JOBZ = 'N' + RESIDS = 'N' + END SELECT + + DO iJOBREF = 1, 3 + + SELECT CASE ( iJOBREF ) + CASE(1) + JOBREF = 'R' ! Data for refined Ritz vectors + CASE(2) + JOBREF = 'E' ! Exact DMD vectors + CASE(3) + JOBREF = 'N' + END SELECT + + DO iSCALE = 1, 4 + + SELECT CASE ( iSCALE ) + CASE(1) + SCALE = 'S' ! X data normalized + CASE(2) + SCALE = 'C' ! X normalized, consist. check + CASE(3) + SCALE = 'Y' ! Y data normalized + CASE(4) + SCALE = 'N' + END SELECT + + DO iNRNK = -1, -2, -1 + ! Two truncation strategies. The "-2" case for R&D + ! purposes only - it uses possibly low accuracy small + ! singular values, in which case the formulas used in + ! the DMD are highly sensitive. + NRNK = iNRNK + + DO iWHTSVD = 1, 4 + ! Check all four options to compute the POD basis + ! via the SVD. + WHTSVD = iWHTSVD + + DO LWMINOPT = 1, 2 + ! Workspace query for the minimal (1) and for the optimal + ! (2) workspace lengths determined by workspace query. + + X(1:M,1:N) = X0(1:M,1:N) + Y(1:M,1:N) = Y0(1:M,1:N) + + ! DGEDMD: Workspace query and workspace allocation + CALL DGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, & + N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, & + LDZ, RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, -1, & + IDUMMY, -1, INFO ) + + LIWORK = IDUMMY(1) + ALLOCATE( IWORK(LIWORK) ) + LWORK = INT(WDUMMY(LWMINOPT)) + ALLOCATE( WORK(LWORK) ) + + ! DGEDMD test: CALL DGEDMD + CALL DGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, & + N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, & + LDZ, RES, AU, LDAU, W, LDW, S, LDS, WORK, LWORK,& + IWORK, LIWORK, INFO ) + + SINGVX(1:N) = WORK(1:N) + + !...... DGEDMD check point + IF ( LSAME(JOBZ,'V') ) THEN + ! Check that Z = X*W, on return from DGEDMD + ! This checks that the returned eigenvectors in Z are + ! the product of the SVD/POD basis returned in X + ! and the eigenvectors of the Rayleigh quotient + !
returned in W + CALL DGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, & + ZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL DAXPY( M, -ONE, Z(1,i), 1, Z1(1,i), 1) + TMP = MAX(TMP, DNRM2( M, Z1(1,i), 1 ) ) + END DO + TMP_ZXW = MAX(TMP_ZXW, TMP ) + + IF ( TMP_ZXW > 10*M*EPS ) THEN + NFAIL_Z_XV = NFAIL_Z_XV + 1 + WRITE(*,*) ':( .................DGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + END IF + + !...... DGEDMD check point + IF ( LSAME(JOBREF,'R') ) THEN + ! The matrix A*U is returned for computing refined Ritz vectors. + ! Check that A*U is computed correctly using the formula + ! A*U = Y * V * inv(SIGMA). This depends on the + ! accuracy in the computed singular values and vectors of X. + ! See the paper for an error analysis. + ! Note that the left singular vectors of the input matrix X + ! are returned in the array X. + CALL DGEMM( 'N', 'N', M, K, M, ONE, A, LDA, X, LDX, & + ZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL DAXPY( M, -ONE, AU(1,i), 1, Z1(1,i), 1) + TMP = MAX( TMP, DNRM2( M, Z1(1,i),1 ) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_AU = MAX( TMP_AU, TMP ) + + IF ( TMP > TOL2 ) THEN + NFAIL_AU = NFAIL_AU + 1 + WRITE(*,*) ':( .................DGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + ELSEIF ( LSAME(JOBREF,'E') ) THEN + ! The unscaled vectors of the Exact DMD are computed. + ! This option is included for the sake of completeness, + ! for users who prefer the Exact DMD vectors. The + ! returned vectors are in the real form, in the same way + ! as the Ritz vectors. Here we just save the vectors + ! and test them separately using a Matlab script. + + CALL DGEMM( 'N', 'N', M, K, M, ONE, A, LDA, AU, LDAU, ZERO, Y1, M ) + i=1 + DO WHILE ( i <= K ) + IF ( IEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL DAXPY( M, -REIG(i), AU(1,i), 1, Y1(1,i), 1 ) + RESEX(i) = DNRM2( M, Y1(1,i), 1) / DNRM2(M,AU(1,i),1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IEIG(i) + AB(1,2) = IEIG(i) + AB(2,2) = REIG(i) + CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, AU(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) + RESEX(i) = DLANGE( 'F', M, 2, Y1(1,i), M, & + WORK )/ DLANGE( 'F', M, 2, AU(1,i), M, & + WORK ) + RESEX(i+1) = RESEX(i) + i = i + 2 + END IF + END DO + + END IF + + !...... DGEDMD check point + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by DGEDMD with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL DGEMM( 'N', 'N', M, K, M, ONE, A, LDA, Z, LDZ, ZERO, Y1, M ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in DGEDMD,) + i = 1 + DO WHILE ( i <= K ) + IF ( IEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL DAXPY( M, -REIG(i), Z(1,i), 1, Y1(1,i), 1 ) + RES1(i) = DNRM2( M, Y1(1,i), 1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! 
Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IEIG(i) + AB(1,2) = IEIG(i) + AB(2,2) = REIG(i) + CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) + RES1(i) = DLANGE( 'F', M, 2, Y1(1,i), M, & + WORK ) + RES1(i+1) = RES1(i) + i = i + 2 + END IF + END DO + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_REZ = MAX( TMP_REZ, TMP ) + + IF ( TMP > TOL2 ) THEN + NFAIL_REZ = NFAIL_REZ + 1 + WRITE(*,*) ':( ..................DGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + IF ( LSAME(JOBREF,'E') ) THEN + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) ) + END DO + TMP_EX = MAX(TMP_EX,TMP) + END IF + + END IF + + !..... store the results for inspection + DO i = 1, K + LAMBDA(i,1) = REIG(i) + LAMBDA(i,2) = IEIG(i) + END DO + + DEALLOCATE(IWORK) + DEALLOCATE(WORK) + + !====================================================================== + ! Now test the DGEDMDQ + !====================================================================== + IF ( TEST_QRDMD .AND. (K_TRAJ == 1) ) THEN + RJOBDATA(2) = 1 + F1 = F + + ! DGEDMDQ test: Workspace query and workspace allocation + CALL DGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, & + JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, & + LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ, & + RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, & + -1, IDUMMY, -1, INFO ) + LIWORK = IDUMMY(1) + ALLOCATE( IWORK(LIWORK) ) + LWORK = INT(WDUMMY(LWMINOPT)) + ALLOCATE(WORK(LWORK)) + ! DGEDMDQ test: CALL DGEDMDQ + CALL DGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, & + JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, & + LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ, & + RES, AU, LDAU, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) + + SINGVQX(1:KQ) = WORK(MIN(M,N+1)+1: MIN(M,N+1)+KQ) + + !..... DGEDMDQ check point + IF ( KQ /= K ) THEN + KDIFF = KDIFF+1 + END IF + + TMP = ZERO + DO i = 1, MIN(K, KQ) + TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / & + SINGVX(1) ) + END DO + SVDIFF = MAX( SVDIFF, TMP ) + IF ( TMP > M*N*EPS ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_SVDIFF = NFAIL_SVDIFF + 1 + DO j =1, 3 + write(*,*) j, SINGVX(j), SINGVQX(j) + read(*,*) + END DO + END IF + + !..... DGEDMDQ check point + IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN + ! Check that the QR factors are computed and returned + ! as requested. The residual ||F-Q*R||_F / ||F||_F + ! is compared to M*N*EPS. + F2 = F + CALL DGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -ONE, F1, & + LDF, Y, LDY, ONE, F2, LDF ) + TMP_FQR = DLANGE( 'F', M, N+1, F2, LDF, WORK ) / & + DLANGE( 'F', M, N+1, F, LDF, WORK ) + IF ( TMP_FQR > TOL2 ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_F_QR = NFAIL_F_QR + 1 + END IF + END IF + + !..... DGEDMDQ check point + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by DGEDMDQ with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL DGEMM( 'N', 'N', M, KQ, M, ONE, A, M, Z, M, ZERO, Y1, M ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. 
(See the description of Z in DGEDMDQ)
+            i = 1
+            DO WHILE ( i <= KQ )
+               IF ( IEIGQ(i) == ZERO ) THEN
+                   ! have a real eigenvalue with real eigenvector
+                   CALL DAXPY( M, -REIGQ(i), Z(1,i), 1, Y1(1,i), 1 )
+                   ! Y(1:M,i) = Y(1:M,i) - REIG(i)*Z(1:M,i)
+                   RES1(i) = DNRM2( M, Y1(1,i), 1)
+                   i = i + 1
+               ELSE
+                  ! Have a complex conjugate pair
+                  ! REIG(i) +- sqrt(-1)*IMEIG(i).
+                  ! Since all computation is done in real
+                  ! arithmetic, the formula for the residual
+                  ! is recast for real representation of the
+                  ! complex conjugate eigenpair. See the
+                  ! description of RES.
+                  AB(1,1) =  REIGQ(i)
+                  AB(2,1) = -IEIGQ(i)
+                  AB(1,2) =  IEIGQ(i)
+                  AB(2,2) =  REIGQ(i)
+                  CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i),        &
+                              M, AB, 2, ONE, Y1(1,i), M )     ! BLAS CALL
+                  ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB   ! INTRINSIC
+                  RES1(i)   = DLANGE( 'F', M, 2, Y1(1,i), M,          &
+                              WORK )                          ! LAPACK CALL
+                  RES1(i+1) = RES1(i)
+                  i = i + 2
+               END IF
+            END DO
+            TMP = ZERO
+            DO i = 1, KQ
+                TMP = MAX( TMP, ABS(RES(i) - RES1(i)) *   &
+                      SINGVQX(KQ)/(ANORM*SINGVQX(1)) )
+            END DO
+            TMP_REZQ = MAX( TMP_REZQ, TMP )
+            IF ( TMP > TOL2 ) THEN
+                NFAIL_REZQ = NFAIL_REZQ + 1
+                WRITE(*,*) '................ DGEDMDQ FAILED!',        &
+                    'Check the code for implementation errors.'
+                STOP
+            END IF
+
+          END IF
+
+          DO i = 1, KQ
+              LAMBDAQ(i,1) = REIGQ(i)
+              LAMBDAQ(i,2) = IEIGQ(i)
+          END DO
+
+          DEALLOCATE(WORK)
+          DEALLOCATE(IWORK)
+        END IF ! TEST_QRDMD
+!======================================================================
+
+      END DO ! LWMINOPT
+      !write(*,*) 'LWMINOPT loop completed'
+      END DO ! WHTSVD LOOP
+      !write(*,*) 'WHTSVD loop completed'
+      END DO ! NRNK LOOP
+      !write(*,*) 'NRNK loop completed'
+      END DO ! SCALE LOOP
+      !write(*,*) 'SCALE loop completed'
+      END DO ! JOBREF LOOP
+      !write(*,*) 'JOBREF loop completed'
+      END DO ! JOBZ LOOP
+      !write(*,*) 'JOBZ loop completed'
+
+      END DO ! MODE 1:6
+      !write(*,*) 'MODE loop completed'
+      END DO ! 1 or 2 trajectories
+      !write(*,*) 'trajectories loop completed'
+
+      DEALLOCATE(A)
+      DEALLOCATE(AC)
+      DEALLOCATE(DA)
+      DEALLOCATE(DL)
+      DEALLOCATE(F)
+      DEALLOCATE(F1)
+      DEALLOCATE(F2)
+      DEALLOCATE(X)
+      DEALLOCATE(X0)
+      DEALLOCATE(SINGVX)
+      DEALLOCATE(SINGVQX)
+      DEALLOCATE(Y)
+      DEALLOCATE(Y0)
+      DEALLOCATE(Y1)
+      DEALLOCATE(Z)
+      DEALLOCATE(Z1)
+      DEALLOCATE(RES)
+      DEALLOCATE(RES1)
+      DEALLOCATE(RESEX)
+      DEALLOCATE(REIG)
+      DEALLOCATE(IEIG)
+      DEALLOCATE(REIGQ)
+      DEALLOCATE(IEIGQ)
+      DEALLOCATE(REIGA)
+      DEALLOCATE(IEIGA)
+      DEALLOCATE(VA)
+      DEALLOCATE(LAMBDA)
+      DEALLOCATE(LAMBDAQ)
+      DEALLOCATE(EIGA)
+      DEALLOCATE(W)
+      DEALLOCATE(AU)
+      DEALLOCATE(S)
+
+!............................................................
+      END DO ! LLOOP
+
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for DGEDMD :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+      IF ( NFAIL_Z_XV == 0 ) THEN
+          WRITE(*,*) '>>>> Z - U*V test PASSED.'
+      ELSE
+          WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)'
+          WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_ZXW
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV
+      END IF
+      IF ( NFAIL_AU == 0 ) THEN
+          WRITE(*,*) '>>>> A*U test PASSED. '
+      ELSE
+          WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)'
+          WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU
+      END IF
+
+      IF ( NFAIL_REZ == 0 ) THEN
+          WRITE(*,*) '>>>> Residual computation test PASSED.'
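+          ! (The adjusted error measure used in this test is
+          !  ABS(RES(i)-RES1(i)) * SINGVX(K)/(ANORM*SINGVX(1)), i.e.
+          !  the difference between the returned and the directly
+          !  recomputed residuals, damped by SINGVX(K)/SINGVX(1),
+          !  roughly the reciprocal condition number of the truncated
+          !  snapshot matrix, and normalized by the Frobenius norm of
+          !  A (ANORM), so that ill-conditioned data do not trigger
+          !  spurious failures against the O(M*N)*EPS threshold TOL2.)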
+      ELSE
+          WRITE(*,*) 'Residual computation test FAILED ', NFAIL_REZ, ' time(s)'
+          WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ
+      END IF
+
+      IF ( NFAIL_TOTAL == 0 ) THEN
+          WRITE(*,*) '>>>> DGEDMD :: ALL TESTS PASSED.'
+      ELSE
+          WRITE(*,*) NFAIL_TOTAL, 'FAILURES!'
+          WRITE(*,*) '>>>>>>>>>>>>>> DGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      IF ( TEST_QRDMD ) THEN
+      WRITE(*,*)
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for DGEDMDQ :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+
+      IF ( NFAIL_SVDIFF == 0 ) THEN
+          WRITE(*,*) '>>>> DGEDMD and DGEDMDQ computed singular &
+                      &values test PASSED.'
+      ELSE
+          WRITE(*,*) 'DGEDMD and DGEDMDQ discrepancies in &
+                     &the singular values unacceptable ', &
+                     NFAIL_SVDIFF, ' times. Test FAILED.'
+          WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF
+      END IF
+
+      IF ( NFAIL_F_QR == 0 ) THEN
+          WRITE(*,*) '>>>> F - Q*R test PASSED.'
+      ELSE
+          WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)'
+          WRITE(*,*) 'The largest relative residual was ', TMP_FQR
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR
+      END IF
+
+      IF ( NFAIL_REZQ == 0 ) THEN
+          WRITE(*,*) '>>>> Residual computation test PASSED.'
+      ELSE
+          WRITE(*,*) 'Residual computation test FAILED ', NFAIL_REZQ, ' time(s)'
+          WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ
+      END IF
+
+      IF ( NFAILQ_TOTAL == 0 ) THEN
+          WRITE(*,*) '>>>>>>> DGEDMDQ :: ALL TESTS PASSED.'
+      ELSE
+          WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!'
+          WRITE(*,*) '>>>>>>> DGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      END IF
+
+      WRITE(*,*)
+      WRITE(*,*) 'Test completed.'
+      STOP
+      END
diff --git a/lapack-netlib/TESTING/EIG/derrst.f b/lapack-netlib/TESTING/EIG/derrst.f
index 059538644..7d111e2e0 100644
--- a/lapack-netlib/TESTING/EIG/derrst.f
+++ b/lapack-netlib/TESTING/EIG/derrst.f
@@ -735,12 +735,12 @@
       CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK )
       INFOT = 18
       CALL DSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $             M, R, Z, 1, IW, Q, 26*N-1, IW( 2*N+1 ), 10*N,
+     $             M, R, Z, 1, IW, Q, 0, IW( 2*N+1 ), 10*N,
      $             INFO )
       CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK )
       INFOT = 20
       CALL DSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $             M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 10*N-1,
+     $             M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 0,
      $             INFO )
       CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK )
       NT = NT + 11
diff --git a/lapack-netlib/TESTING/EIG/schkdmd.f90 b/lapack-netlib/TESTING/EIG/schkdmd.f90
new file mode 100644
index 000000000..77e3e46c0
--- /dev/null
+++ b/lapack-netlib/TESTING/EIG/schkdmd.f90
@@ -0,0 +1,792 @@
+! This is a test program for checking the implementations of
+! the following subroutines
+!
+!   SGEDMD  for computation of the
+!           Dynamic Mode Decomposition (DMD)
+!   SGEDMDQ for computation of a
+!           QR factorization based compressed DMD
+!
+!   Developed and supported by:
+!   ===========================
+!   Developed and coded by Zlatko Drmac, Faculty of Science,
+!   University of Zagreb;  drmac@math.hr
+!   In cooperation with
+!   AIMdyn Inc., Santa Barbara, CA.
+!
========================================================
+!   How to run the code (compiler, link info)
+!   ========================================================
+!   Compile as FORTRAN 90 (or later) and link with BLAS and
+!   LAPACK libraries.
+!   NOTE: The code is developed and tested on top of the
+!   Intel MKL library (versions 2022.0.3 and 2022.2.0),
+!   using the Intel Fortran compiler.
+!
+!   For developers of the C++ implementation
+!   ========================================================
+!   See the LAPACK++ and Template Numerical Toolkit (TNT)
+!
+!   Note on the development of the GPU HP implementation
+!   ========================================================
+!   Work in progress. See CUDA, MAGMA, SLATE.
+!   NOTE: The four SVD subroutines used in this code are
+!   included as part of R&D and for completeness.
+!   This was also an opportunity to test those SVD codes.
+!   If the scaling option is used, all four are essentially
+!   equally good. For implementations on HP platforms,
+!   one can use whichever SVD is available.
+!............................................................
+!   NOTE:
+!   When using the Intel MKL 2022.0.3 the subroutine xGESVDQ
+!   (optionally used in xGEDMD) may cause an access violation
+!   error for x = S, D, C, Z, but only if called with the
+!   workspace query. (At least in our Windows 10 MSVS 2019.)
+!   The problem can be mitigated by downloading the source
+!   code of xGESVDQ from the LAPACK repository and using it
+!   locally instead of the one in the MKL. This seems to
+!   indicate that the problem is indeed in the MKL.
+!   This problem did not appear with Intel MKL 2022.2.0.
+!
+!   NOTE:
+!   xGESDD seems to have a problem with workspace. In some
+!   cases the length of the optimal workspace is returned
+!   smaller than the minimal workspace, as specified in the
+!   code. As a precaution, all optimal workspaces are
+!   set as MAX(minimal, optimal).
+!   Latest implementations of complex xGESDD have different
+!   lengths of the real workspace. We use the max value over
+!   the two versions.
+!............................................................
+!............................................................
+!
+      PROGRAM DMD_TEST
+      use iso_fortran_env, only: real32
+      IMPLICIT NONE
+      integer, parameter :: WP = real32
+
+!............................................................
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+!............................................................
+      REAL(KIND=WP), ALLOCATABLE, DIMENSION(:,:) ::          &
+                     A, AC, EIGA, LAMBDA, LAMBDAQ, F, F1, F2,&
+                     Z, Z1, S, AU, W, VA, X, X0, Y, Y0, Y1
+      REAL(KIND=WP), ALLOCATABLE, DIMENSION(:)   ::          &
+                     DA, DL, DR, REIG, REIGA, REIGQ, IEIG,   &
+                     IEIGA, IEIGQ, RES, RES1, RESEX, SINGVX, &
+                     SINGVQX, WORK
+      INTEGER      , ALLOCATABLE, DIMENSION(:)   ::   IWORK
+      REAL(KIND=WP) :: AB(2,2), WDUMMY(2)
+      INTEGER       :: IDUMMY(2), ISEED(4), RJOBDATA(8)
+      REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, DMAX, EPS, &
+                       TOL, TOL2, SVDIFF, TMP, TMP_AU,       &
+                       TMP_FQR, TMP_REZ, TMP_REZQ, TMP_ZXW,  &
+                       TMP_EX, XNORM, YNORM
+!............................................................
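+!     NOTE: WDUMMY and IDUMMY receive the results of the LAPACK-style
+!     workspace queries below: SGEDMD/SGEDMDQ are first called with
+!     LWORK = -1 and LIWORK = -1, which only computes the workspace
+!     requirements; WDUMMY(1:2) then holds the minimal and the optimal
+!     real workspace lengths (one of which is selected via LWMINOPT)
+!     and IDUMMY(1) the integer workspace length, and these are used
+!     to ALLOCATE WORK and IWORK before the actual computational calls.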
+ INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY, & + LDZ, LIWORK, LWORK, M, N, L, LLOOP, NRNK + INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, KDIFF, & + NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ, & + NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, & + NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD + INTEGER iNRNK, iWHTSVD, K_TRAJ, LWMINOPT + CHARACTER(LEN=1) GRADE, JOBREF, JOBZ, PIVTNG, RSIGN, & + SCALE, RESIDS, WANTQ, WANTR + + LOGICAL TEST_QRDMD +!..... external subroutines (BLAS and LAPACK) + EXTERNAL SAXPY, SGEEV, SGEMM, SGEMV, SLACPY, SLASCL + EXTERNAL SLARNV, SLATMR +!.....external subroutines DMD package, part 1 +! subroutines under test + EXTERNAL SGEDMD, SGEDMDQ + +!..... external functions (BLAS and LAPACK) + EXTERNAL SLAMCH, SLANGE, SNRM2 + REAL(KIND=WP) :: SLAMCH, SLANGE, SNRM2 + EXTERNAL LSAME + LOGICAL LSAME + + INTRINSIC ABS, INT, MIN, MAX +!............................................................ + + ! The test is always in pairs : ( SGEDMD and SGEDMDQ ) + ! because the test includes comparing the results (in pairs). +!..................................................................................... + TEST_QRDMD = .TRUE. ! This code by default performs tests on SGEDMDQ + ! Since the QR factorizations based algorithm is designed for + ! single trajectory data, only single trajectory tests will + ! be performed with xGEDMDQ. + WANTQ = 'Q' + WANTR = 'R' +!................................................................................. + + EPS = SLAMCH( 'P' ) ! machine precision SP + + ! Global counters of failures of some particular tests + NFAIL = 0 + NFAIL_REZ = 0 + NFAIL_REZQ = 0 + NFAIL_Z_XV = 0 + NFAIL_F_QR = 0 + NFAIL_AU = 0 + KDIFF = 0 + NFAIL_SVDIFF = 0 + NFAIL_TOTAL = 0 + NFAILQ_TOTAL = 0 + + + DO LLOOP = 1, 4 + + WRITE(*,*) 'L Loop Index = ', LLOOP + + ! Set the dimensions of the problem ... + WRITE(*,*) 'M = ' + READ(*,*) M + WRITE(*,*) M + ! ... and the number of snapshots. + WRITE(*,*) 'N = ' + READ(*,*) N + WRITE(*,*) N + + ! ... Test the dimensions + IF ( ( MIN(M,N) == 0 ) .OR. ( M < N ) ) THEN + WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.' + STOP + END IF +!............. + ! The seed inside the LLOOP so that each pass can be reproduced easily. + + ISEED(1) = 4 + ISEED(2) = 3 + ISEED(3) = 2 + ISEED(4) = 1 + + LDA = M + LDF = M + LDX = MAX(M,N+1) + LDY = MAX(M,N+1) + LDW = N + LDZ = M + LDAU = MAX(M,N+1) + LDS = N + + TMP_ZXW = ZERO + TMP_AU = ZERO + TMP_REZ = ZERO + TMP_REZQ = ZERO + SVDIFF = ZERO + TMP_EX = ZERO + + ! + ! Test the subroutines on real data snapshots. All + ! computation is done in real arithmetic, even when + ! Koopman eigenvalues and modes are real. + ! + ! Allocate memory space + ALLOCATE( A(LDA,M) ) + ALLOCATE( AC(LDA,M) ) + ALLOCATE( DA(M) ) + ALLOCATE( DL(M) ) + ALLOCATE( F(LDF,N+1) ) + ALLOCATE( F1(LDF,N+1) ) + ALLOCATE( F2(LDF,N+1) ) + ALLOCATE( X(LDX,N) ) + ALLOCATE( X0(LDX,N) ) + ALLOCATE( SINGVX(N) ) + ALLOCATE( SINGVQX(N) ) + ALLOCATE( Y(LDY,N+1) ) + ALLOCATE( Y0(LDY,N+1) ) + ALLOCATE( Y1(M,N+1) ) + ALLOCATE( Z(LDZ,N) ) + ALLOCATE( Z1(LDZ,N) ) + ALLOCATE( RES(N) ) + ALLOCATE( RES1(N) ) + ALLOCATE( RESEX(N) ) + ALLOCATE( REIG(N) ) + ALLOCATE( IEIG(N) ) + ALLOCATE( REIGQ(N) ) + ALLOCATE( IEIGQ(N) ) + ALLOCATE( REIGA(M) ) + ALLOCATE( IEIGA(M) ) + ALLOCATE( VA(LDA,M) ) + ALLOCATE( LAMBDA(N,2) ) + ALLOCATE( LAMBDAQ(N,2) ) + ALLOCATE( EIGA(M,2) ) + ALLOCATE( W(LDW,N) ) + ALLOCATE( AU(LDAU,N) ) + ALLOCATE( S(N,N) ) + + TOL = M*EPS + ! This mimics O(M*N)*EPS bound for accumulated roundoff error. + ! 
The factor 10 is somewhat arbitrary. + TOL2 = 10*M*N*EPS + +!............. + + DO K_TRAJ = 1, 2 + ! Number of intial conditions in the simulation/trajectories (1 or 2) + + COND = 1.0D8 + DMAX = 1.0D2 + RSIGN = 'F' + GRADE = 'N' + MODEL = 6 + CONDL = 1.0D2 + MODER = 6 + CONDR = 1.0D2 + PIVTNG = 'N' + + ! Loop over all parameter MODE values for ZLATMR (+1,..,+6) + DO MODE = 1, 6 + + ALLOCATE( IWORK(2*M) ) + ALLOCATE(DR(N)) + CALL SLATMR( M, M, 'S', ISEED, 'N', DA, MODE, COND, & + DMAX, RSIGN, GRADE, DL, MODEL, CONDL, & + DR, MODER, CONDR, PIVTNG, IWORK, M, M, & + ZERO, -ONE, 'N', A, LDA, IWORK(M+1), INFO ) + DEALLOCATE(IWORK) + DEALLOCATE(DR) + + LWORK = 4*M+1 + ALLOCATE(WORK(LWORK)) + AC = A + CALL SGEEV( 'N','V', M, AC, M, REIGA, IEIGA, VA, M, & + VA, M, WORK, LWORK, INFO ) ! LAPACK CALL + DEALLOCATE(WORK) + TMP = ZERO + DO i = 1, M + EIGA(i,1) = REIGA(i) + EIGA(i,2) = IEIGA(i) + TMP = MAX( TMP, SQRT(REIGA(i)**2+IEIGA(i)**2)) + END DO + + ! Scale A to have the desirable spectral radius. + CALL SLASCL( 'G', 0, 0, TMP, ONE, M, M, A, M, INFO ) + CALL SLASCL( 'G', 0, 0, TMP, ONE, M, 2, EIGA, M, INFO ) + + ! Compute the norm of A + ANORM = SLANGE( 'F', N, N, A, M, WDUMMY ) + + IF ( K_TRAJ == 2 ) THEN + ! generate data with two inital conditions + CALL SLARNV(2, ISEED, M, F1(1,1) ) + F1(1:M,1) = 1.0E-10*F1(1:M,1) + DO i = 1, N/2 + CALL SGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, & + F1(1,i+1), 1 ) + END DO + X0(1:M,1:N/2) = F1(1:M,1:N/2) + Y0(1:M,1:N/2) = F1(1:M,2:N/2+1) + + CALL SLARNV(2, ISEED, M, F1(1,1) ) + DO i = 1, N-N/2 + CALL SGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, & + F1(1,i+1), 1 ) + END DO + X0(1:M,N/2+1:N) = F1(1:M,1:N-N/2) + Y0(1:M,N/2+1:N) = F1(1:M,2:N-N/2+1) + ELSE + ! single trajectory + CALL SLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N + CALL SGEMV( 'N', M, M, ONE, A, M, F(1,i), 1, ZERO, & + F(1,i+1), 1 ) + END DO + X0(1:M,1:N) = F(1:M,1:N) + Y0(1:M,1:N) = F(1:M,2:N+1) + END IF + + XNORM = SLANGE( 'F', M, N, X0, LDX, WDUMMY ) + YNORM = SLANGE( 'F', M, N, Y0, LDX, WDUMMY ) +!............................................................ + + DO iJOBZ = 1, 4 + + SELECT CASE ( iJOBZ ) + CASE(1) + JOBZ = 'V' ! Ritz vectors will be computed + RESIDS = 'R' ! Residuals will be computed + CASE(2) + JOBZ = 'V' + RESIDS = 'N' + CASE(3) + JOBZ = 'F' ! Ritz vectors in factored form + RESIDS = 'N' + CASE(4) + JOBZ = 'N' + RESIDS = 'N' + END SELECT + + DO iJOBREF = 1, 3 + + SELECT CASE ( iJOBREF ) + CASE(1) + JOBREF = 'R' ! Data for refined Ritz vectors + CASE(2) + JOBREF = 'E' ! Exact DMD vectors + CASE(3) + JOBREF = 'N' + END SELECT + + DO iSCALE = 1, 4 + + SELECT CASE ( iSCALE ) + CASE(1) + SCALE = 'S' ! X data normalized + CASE(2) + SCALE = 'C' ! X normalized, consist. check + CASE(3) + SCALE = 'Y' ! Y data normalized + CASE(4) + SCALE = 'N' + END SELECT + + DO iNRNK = -1, -2, -1 + ! Two truncation strategies. The "-2" case for R&D + ! purposes only - it uses possibly low accuracy small + ! singular values, in which case the formulas used in + ! the DMD are highly sensitive. + NRNK = iNRNK + + DO iWHTSVD = 1, 4 + ! Check all four options to compute the POD basis + ! via the SVD. + WHTSVD = iWHTSVD + + DO LWMINOPT = 1, 2 + ! Workspace query for the minimal (1) and for the optimal + ! (2) workspace lengths determined by workspace query. + + X(1:M,1:N) = X0(1:M,1:N) + Y(1:M,1:N) = Y0(1:M,1:N) + + ! 
SGEDMD: Workspace query and workspace allocation + CALL SGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, & + N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, & + LDZ, RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, -1, & + IDUMMY, -1, INFO ) + + LIWORK = IDUMMY(1) + ALLOCATE( IWORK(LIWORK) ) + LWORK = INT(WDUMMY(LWMINOPT)) + ALLOCATE( WORK(LWORK) ) + + ! SGEDMD test: CALL SGEDMD + CALL SGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, & + N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, & + LDZ, RES, AU, LDAU, W, LDW, S, LDS, WORK, LWORK,& + IWORK, LIWORK, INFO ) + + SINGVX(1:N) = WORK(1:N) + + !...... SGEDMD check point + IF ( LSAME(JOBZ,'V') ) THEN + ! Check that Z = X*W, on return from SGEDMD + ! This checks that the returned aigenvectors in Z are + ! the product of the SVD'POD basis returned in X + ! and the eigenvectors of the rayleigh quotient + ! returned in W + CALL SGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, & + ZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL SAXPY( M, -ONE, Z(1,i), 1, Z1(1,i), 1) + TMP = MAX(TMP, SNRM2( M, Z1(1,i), 1 ) ) + END DO + TMP_ZXW = MAX(TMP_ZXW, TMP ) + + IF ( TMP_ZXW > 10*M*EPS ) THEN + NFAIL_Z_XV = NFAIL_Z_XV + 1 + END IF + + END IF + + !...... SGEDMD check point + IF ( LSAME(JOBREF,'R') ) THEN + ! The matrix A*U is returned for computing refined Ritz vectors. + ! Check that A*U is computed correctly using the formula + ! A*U = Y * V * inv(SIGMA). This depends on the + ! accuracy in the computed singular values and vectors of X. + ! See the paper for an error analysis. + ! Note that the left singular vectors of the input matrix X + ! are returned in the array X. + CALL SGEMM( 'N', 'N', M, K, M, ONE, A, LDA, X, LDX, & + ZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL SAXPY( M, -ONE, AU(1,i), 1, Z1(1,i), 1) + TMP = MAX( TMP, SNRM2( M, Z1(1,i),1 ) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_AU = MAX( TMP_AU, TMP ) + + IF ( TMP > TOL2 ) THEN + NFAIL_AU = NFAIL_AU + 1 + END IF + + ELSEIF ( LSAME(JOBREF,'E') ) THEN + ! The unscaled vectors of the Exact DMD are computed. + ! This option is included for the sake of completeness, + ! for users who prefer the Exact DMD vectors. The + ! returned vectors are in the real form, in the same way + ! as the Ritz vectors. Here we just save the vectors + ! and test them separately using a Matlab script. + + CALL SGEMM( 'N', 'N', M, K, M, ONE, A, LDA, AU, LDAU, ZERO, Y1, M ) + i=1 + DO WHILE ( i <= K ) + IF ( IEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL SAXPY( M, -REIG(i), AU(1,i), 1, Y1(1,i), 1 ) + RESEX(i) = SNRM2( M, Y1(1,i), 1) / SNRM2(M,AU(1,i),1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IEIG(i) + AB(1,2) = IEIG(i) + AB(2,2) = REIG(i) + CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, AU(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) + RESEX(i) = SLANGE( 'F', M, 2, Y1(1,i), M, & + WORK )/ SLANGE( 'F', M, 2, AU(1,i), M, & + WORK ) + RESEX(i+1) = RESEX(i) + i = i + 2 + END IF + END DO + + END IF + + !...... SGEDMD check point + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by SGEDMD with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL SGEMM( 'N', 'N', M, K, M, ONE, A, LDA, Z, LDZ, ZERO, Y1, M ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! 
of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in SGEDMD,) + i = 1 + DO WHILE ( i <= K ) + IF ( IEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL SAXPY( M, -REIG(i), Z(1,i), 1, Y1(1,i), 1 ) + RES1(i) = SNRM2( M, Y1(1,i), 1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IEIG(i) + AB(1,2) = IEIG(i) + AB(2,2) = REIG(i) + CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) + RES1(i) = SLANGE( 'F', M, 2, Y1(1,i), M, & + WORK ) + RES1(i+1) = RES1(i) + i = i + 2 + END IF + END DO + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_REZ = MAX( TMP_REZ, TMP ) + + IF ( TMP > TOL2 ) THEN + NFAIL_REZ = NFAIL_REZ + 1 + END IF + + IF ( LSAME(JOBREF,'E') ) THEN + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) ) + END DO + TMP_EX = MAX(TMP_EX,TMP) + END IF + + END IF + + ! ... store the results for inspection + DO i = 1, K + LAMBDA(i,1) = REIG(i) + LAMBDA(i,2) = IEIG(i) + END DO + + DEALLOCATE(IWORK) + DEALLOCATE(WORK) + + !====================================================================== + ! Now test the SGEDMDQ, if requested. + !====================================================================== + IF ( TEST_QRDMD .AND. (K_TRAJ == 1) ) THEN + RJOBDATA(2) = 1 + F1 = F + + ! SGEDMDQ test: Workspace query and workspace allocation + CALL SGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, & + JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, & + LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ, & + RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, & + -1, IDUMMY, -1, INFO ) + LIWORK = IDUMMY(1) + ALLOCATE( IWORK(LIWORK) ) + LWORK = INT(WDUMMY(LWMINOPT)) + ALLOCATE(WORK(LWORK)) + + ! SGEDMDQ test: CALL SGEDMDQ + CALL SGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, & + JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, & + LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ, & + RES, AU, LDAU, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) + + SINGVQX(1:KQ) = WORK(MIN(M,N+1)+1: MIN(M,N+1)+KQ) + + !..... SGEDMDQ check point + IF ( KQ /= K ) THEN + KDIFF = KDIFF+1 + END IF + + TMP = ZERO + DO i = 1, MIN(K, KQ) + TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / & + SINGVX(1) ) + END DO + SVDIFF = MAX( SVDIFF, TMP ) + IF ( TMP > M*N*EPS ) THEN + NFAIL_SVDIFF = NFAIL_SVDIFF + 1 + END IF + + !..... SGEDMDQ check point + IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN + ! Check that the QR factors are computed and returned + ! as requested. The residual ||F-Q*R||_F / ||F||_F + ! is compared to M*N*EPS. + F2 = F + CALL SGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -ONE, F1, & + LDF, Y, LDY, ONE, F2, LDF ) + TMP_FQR = SLANGE( 'F', M, N+1, F2, LDF, WORK ) / & + SLANGE( 'F', M, N+1, F, LDF, WORK ) + IF ( TMP_FQR > TOL2 ) THEN + NFAIL_F_QR = NFAIL_F_QR + 1 + END IF + END IF + + !..... SGEDMDQ checkpoint + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by SGEDMDQ with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL SGEMM( 'N', 'N', M, KQ, M, ONE, A, M, Z, M, ZERO, Y1, M ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! 
pairs of eigenvalues. (See the description of Z in SGEDMDQ)
+            i = 1
+            DO WHILE ( i <= KQ )
+               IF ( IEIGQ(i) == ZERO ) THEN
+                   ! have a real eigenvalue with real eigenvector
+                   CALL SAXPY( M, -REIGQ(i), Z(1,i), 1, Y1(1,i), 1 )
+                   ! Y(1:M,i) = Y(1:M,i) - REIG(i)*Z(1:M,i)
+                   RES1(i) = SNRM2( M, Y1(1,i), 1)
+                   i = i + 1
+               ELSE
+                  ! Have a complex conjugate pair
+                  ! REIG(i) +- sqrt(-1)*IMEIG(i).
+                  ! Since all computation is done in real
+                  ! arithmetic, the formula for the residual
+                  ! is recast for real representation of the
+                  ! complex conjugate eigenpair. See the
+                  ! description of RES.
+                  AB(1,1) =  REIGQ(i)
+                  AB(2,1) = -IEIGQ(i)
+                  AB(1,2) =  IEIGQ(i)
+                  AB(2,2) =  REIGQ(i)
+                  CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i),        &
+                              M, AB, 2, ONE, Y1(1,i), M )     ! BLAS CALL
+                  ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB   ! INTRINSIC
+                  RES1(i)   = SLANGE( 'F', M, 2, Y1(1,i), M,          &
+                              WORK )                          ! LAPACK CALL
+                  RES1(i+1) = RES1(i)
+                  i = i + 2
+               END IF
+            END DO
+            TMP = ZERO
+            DO i = 1, KQ
+                TMP = MAX( TMP, ABS(RES(i) - RES1(i)) *   &
+                      SINGVQX(KQ)/(ANORM*SINGVQX(1)) )
+            END DO
+            TMP_REZQ = MAX( TMP_REZQ, TMP )
+            IF ( TMP > TOL2 ) THEN
+                NFAIL_REZQ = NFAIL_REZQ + 1
+            END IF
+
+          END IF
+
+          DO i = 1, KQ
+              LAMBDAQ(i,1) = REIGQ(i)
+              LAMBDAQ(i,2) = IEIGQ(i)
+          END DO
+
+          DEALLOCATE(WORK)
+          DEALLOCATE(IWORK)
+        END IF ! TEST_QRDMD
+!======================================================================
+
+      END DO ! LWMINOPT
+      !write(*,*) 'LWMINOPT loop completed'
+      END DO ! WHTSVD LOOP
+      !write(*,*) 'WHTSVD loop completed'
+      END DO ! NRNK LOOP
+      !write(*,*) 'NRNK loop completed'
+      END DO ! SCALE LOOP
+      !write(*,*) 'SCALE loop completed'
+      END DO ! JOBREF LOOP
+      !write(*,*) 'JOBREF loop completed'
+      END DO ! JOBZ LOOP
+      !write(*,*) 'JOBZ loop completed'
+
+      END DO ! MODE 1:6
+      !write(*,*) 'MODE loop completed'
+      END DO ! 1 or 2 trajectories
+      !write(*,*) 'trajectories loop completed'
+
+      DEALLOCATE(A)
+      DEALLOCATE(AC)
+      DEALLOCATE(DA)
+      DEALLOCATE(DL)
+      DEALLOCATE(F)
+      DEALLOCATE(F1)
+      DEALLOCATE(F2)
+      DEALLOCATE(X)
+      DEALLOCATE(X0)
+      DEALLOCATE(SINGVX)
+      DEALLOCATE(SINGVQX)
+      DEALLOCATE(Y)
+      DEALLOCATE(Y0)
+      DEALLOCATE(Y1)
+      DEALLOCATE(Z)
+      DEALLOCATE(Z1)
+      DEALLOCATE(RES)
+      DEALLOCATE(RES1)
+      DEALLOCATE(RESEX)
+      DEALLOCATE(REIG)
+      DEALLOCATE(IEIG)
+      DEALLOCATE(REIGQ)
+      DEALLOCATE(IEIGQ)
+      DEALLOCATE(REIGA)
+      DEALLOCATE(IEIGA)
+      DEALLOCATE(VA)
+      DEALLOCATE(LAMBDA)
+      DEALLOCATE(LAMBDAQ)
+      DEALLOCATE(EIGA)
+      DEALLOCATE(W)
+      DEALLOCATE(AU)
+      DEALLOCATE(S)
+
+!............................................................
+      END DO ! LLOOP
+
+
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for SGEDMD :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+      IF ( NFAIL_Z_XV == 0 ) THEN
+          WRITE(*,*) '>>>> Z - U*V test PASSED.'
+      ELSE
+          WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)'
+          WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_ZXW
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV
+      END IF
+      IF ( NFAIL_AU == 0 ) THEN
+          WRITE(*,*) '>>>> A*U test PASSED. '
+      ELSE
+          WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)'
+          WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU
+      END IF
+
+      IF ( NFAIL_REZ == 0 ) THEN
+          WRITE(*,*) '>>>> Residual computation test PASSED.'
+      ELSE
+          WRITE(*,*) 'Residual computation test FAILED ', NFAIL_REZ, ' time(s)'
+          WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ
+      END IF
+
+      IF ( NFAIL_TOTAL == 0 ) THEN
+          WRITE(*,*) '>>>> SGEDMD :: ALL TESTS PASSED.'
+      ELSE
+          WRITE(*,*) NFAIL_TOTAL, 'FAILURES!'
+          WRITE(*,*) '>>>>>>>>>>>>>> SGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      IF ( TEST_QRDMD ) THEN
+      WRITE(*,*)
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for SGEDMDQ :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+
+      IF ( NFAIL_SVDIFF == 0 ) THEN
+          WRITE(*,*) '>>>> SGEDMD and SGEDMDQ computed singular &
+                      &values test PASSED.'
+      ELSE
+          WRITE(*,*) 'SGEDMD and SGEDMDQ discrepancies in &
+                     &the singular values unacceptable ', &
+                     NFAIL_SVDIFF, ' times. Test FAILED.'
+          WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF
+      END IF
+
+      IF ( NFAIL_F_QR == 0 ) THEN
+          WRITE(*,*) '>>>> F - Q*R test PASSED.'
+      ELSE
+          WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)'
+          WRITE(*,*) 'The largest relative residual was ', TMP_FQR
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR
+      END IF
+
+      IF ( NFAIL_REZQ == 0 ) THEN
+          WRITE(*,*) '>>>> Residual computation test PASSED.'
+      ELSE
+          WRITE(*,*) 'Residual computation test FAILED ', NFAIL_REZQ, ' time(s)'
+          WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ
+      END IF
+
+      IF ( NFAILQ_TOTAL == 0 ) THEN
+          WRITE(*,*) '>>>>>>> SGEDMDQ :: ALL TESTS PASSED.'
+      ELSE
+          WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!'
+          WRITE(*,*) '>>>>>>> SGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      END IF
+
+      WRITE(*,*)
+      WRITE(*,*) 'Test completed.'
+      STOP
+      END
diff --git a/lapack-netlib/TESTING/EIG/serrst.f b/lapack-netlib/TESTING/EIG/serrst.f
index b87fc42ef..408346382 100644
--- a/lapack-netlib/TESTING/EIG/serrst.f
+++ b/lapack-netlib/TESTING/EIG/serrst.f
@@ -733,12 +733,12 @@
       CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK )
       INFOT = 18
       CALL SSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0,
-     $             M, R, Z, 1, IW, Q, 26*N-1, IW( 2*N+1 ), 10*N,
+     $             M, R, Z, 1, IW, Q, 0, IW( 2*N+1 ), 10*N,
      $             INFO )
       CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK )
       INFOT = 20
       CALL SSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0,
-     $             M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 10*N-1,
+     $             M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 0,
      $             INFO )
       CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK )
       NT = NT + 11
diff --git a/lapack-netlib/TESTING/EIG/zchkdmd.f90 b/lapack-netlib/TESTING/EIG/zchkdmd.f90
new file mode 100644
index 000000000..873d956c4
--- /dev/null
+++ b/lapack-netlib/TESTING/EIG/zchkdmd.f90
@@ -0,0 +1,745 @@
+! This is a test program for checking the implementations of
+! the following subroutines
+!
+!   ZGEDMD,  for computation of the
+!            Dynamic Mode Decomposition (DMD)
+!   ZGEDMDQ, for computation of a
+!            QR factorization based compressed DMD
+!
+!   Developed and supported by:
+!   ===========================
+!   Developed and coded by Zlatko Drmac, Faculty of Science,
+!   University of Zagreb;  drmac@math.hr
+!   In cooperation with
+!   AIMdyn Inc., Santa Barbara, CA.
+!
======================================================== +! How to run the code (compiler, link info) +! ======================================================== +! Compile as FORTRAN 90 (or later) and link with BLAS and +! LAPACK libraries. +! NOTE: The code is developed and tested on top of the +! Intel MKL library (versions 2022.0.3 and 2022.2.0), +! using the Intel Fortran compiler. +! +! For developers of the C++ implementation +! ======================================================== +! See the LAPACK++ and Template Numerical Toolkit (TNT) +! +! Note on a development of the GPU HP implementation +! ======================================================== +! Work in progress. See CUDA, MAGMA, SLATE. +! NOTE: The four SVD subroutines used in this code are +! included as a part of R&D and for the completeness. +! This was also an opportunity to test those SVD codes. +! If the scaling option is used all four are essentially +! equally good. For implementations on HP platforms, +! one can use whichever SVD is available. +!............................................................ + +!............................................................ +!............................................................ +! + PROGRAM DMD_TEST + use iso_fortran_env, only: real64 + IMPLICIT NONE + integer, parameter :: WP = real64 + +!............................................................ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP + + COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) + COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) +!............................................................ + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:) :: RES, & + RES1, RESEX, SINGVX, SINGVQX, WORK + INTEGER , ALLOCATABLE, DIMENSION(:) :: IWORK + REAL(KIND=WP) :: WDUMMY(2) + INTEGER :: IDUMMY(4), ISEED(4) + REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, EPS, & + TOL, TOL2, SVDIFF, TMP, TMP_AU, & + TMP_FQR, TMP_REZ, TMP_REZQ, TMP_ZXW, & + TMP_EX + +!............................................................ + COMPLEX(KIND=WP) :: ZMAX + INTEGER :: LZWORK + COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:,:) :: ZA, ZAC, & + ZAU, ZF, ZF0, ZF1, ZS, ZW, & + ZX, ZX0, ZY, ZY0, ZY1, ZZ, ZZ1 + COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:) :: ZDA, ZDR, & + ZDL, ZEIGS, ZEIGSA, ZWORK + COMPLEX(KIND=WP) :: ZDUMMY(22), ZDUM2X2(2,2) +!............................................................ + INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY, & + LDZ, LIWORK, LWORK, M, N, LLOOP, NRNK, NRNKsp + INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, j, & + NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ, & + NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, & + NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD, & + WHTSVDsp + INTEGER :: iNRNK, iWHTSVD, K_TRAJ, LWMINOPT + CHARACTER :: GRADE, JOBREF, JOBZ, PIVTNG, RSIGN, & + SCALE, RESIDS, WANTQ, WANTR + LOGICAL :: TEST_QRDMD + +!.....external subroutines (BLAS and LAPACK) + EXTERNAL DAXPY, DGEEV, DGEMM, DGEMV, DLACPY, DLASCL + EXTERNAL ZGEEV, ZGEMV, ZLASCL + EXTERNAL ZLARNV, ZLATMR + EXTERNAL ZAXPY, ZGEMM +!.....external subroutines DMD package, part 1 +! subroutines under test + EXTERNAL ZGEDMD, ZGEDMDQ +!.....external functions (BLAS and LAPACK) + EXTERNAL DLAMCH, DZNRM2 + REAL(KIND=WP) :: DLAMCH, DZNRM2 + REAL(KIND=WP) :: ZLANGE + EXTERNAL IZAMAX + INTEGER IZAMAX + EXTERNAL LSAME + LOGICAL LSAME + + INTRINSIC ABS, INT, MIN, MAX, SIGN +!............................................................ + + ! 
The test is always in pairs : ( ZGEDMD and ZGEDMDQ ) + ! because the test includes comparing the results (in pairs). +!..................................................................................... + TEST_QRDMD = .TRUE. ! This code by default performs tests on ZGEDMDQ + ! Since the QR factorizations based algorithm is designed for + ! single trajectory data, only single trajectory tests will + ! be performed with xGEDMDQ. + WANTQ = 'Q' + WANTR = 'R' +!................................................................................. + + EPS = DLAMCH( 'P' ) ! machine precision DP + + ! Global counters of failures of some particular tests + NFAIL = 0 + NFAIL_REZ = 0 + NFAIL_REZQ = 0 + NFAIL_Z_XV = 0 + NFAIL_F_QR = 0 + NFAIL_AU = 0 + NFAIL_SVDIFF = 0 + NFAIL_TOTAL = 0 + NFAILQ_TOTAL = 0 + + DO LLOOP = 1, 4 + + WRITE(*,*) 'L Loop Index = ', LLOOP + + ! Set the dimensions of the problem ... + WRITE(*,*) 'M = ' + READ(*,*) M + WRITE(*,*) M + ! ... and the number of snapshots. + WRITE(*,*) 'N = ' + READ(*,*) N + WRITE(*,*) N + + ! ... Test the dimensions + IF ( ( MIN(M,N) == 0 ) .OR. ( M < N ) ) THEN + WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.' + STOP + END IF +!............. + ! The seed inside the LLOOP so that each pass can be reproduced easily. + ISEED(1) = 4 + ISEED(2) = 3 + ISEED(3) = 2 + ISEED(4) = 1 + + LDA = M + LDF = M + LDX = M + LDY = M + LDW = N + LDZ = M + LDAU = M + LDS = N + + TMP_ZXW = ZERO + TMP_AU = ZERO + TMP_REZ = ZERO + TMP_REZQ = ZERO + SVDIFF = ZERO + TMP_EX = ZERO + + ALLOCATE( ZA(LDA,M) ) + ALLOCATE( ZAC(LDA,M) ) + ALLOCATE( ZF(LDF,N+1) ) + ALLOCATE( ZF0(LDF,N+1) ) + ALLOCATE( ZF1(LDF,N+1) ) + ALLOCATE( ZX(LDX,N) ) + ALLOCATE( ZX0(LDX,N) ) + ALLOCATE( ZY(LDY,N+1) ) + ALLOCATE( ZY0(LDY,N+1) ) + ALLOCATE( ZY1(LDY,N+1) ) + ALLOCATE( ZAU(LDAU,N) ) + ALLOCATE( ZW(LDW,N) ) + ALLOCATE( ZS(LDS,N) ) + ALLOCATE( ZZ(LDZ,N) ) + ALLOCATE( ZZ1(LDZ,N) ) + ALLOCATE( RES(N) ) + ALLOCATE( RES1(N) ) + ALLOCATE( RESEX(N) ) + ALLOCATE( ZEIGS(N) ) + ALLOCATE( SINGVX(N) ) + ALLOCATE( SINGVQX(N) ) + + TOL = M*EPS + ! This mimics O(M*N)*EPS bound for accumulated roundoff error. + ! The factor 10 is somewhat arbitrary. + TOL2 = 10*M*N*EPS + +!............. + + DO K_TRAJ = 1, 2 + ! Number of intial conditions in the simulation/trajectories (1 or 2) + + COND = 1.0D4 + ZMAX = (1.0D1,1.0D1) + RSIGN = 'F' + GRADE = 'N' + MODEL = 6 + CONDL = 1.0D1 + MODER = 6 + CONDR = 1.0D1 + PIVTNG = 'N' + + ! Loop over all parameter MODE values for ZLATMR (+1,..,+6) + DO MODE = 1, 6 + + ALLOCATE( IWORK(2*M) ) + ALLOCATE( ZDA(M) ) + ALLOCATE( ZDL(M) ) + ALLOCATE( ZDR(M) ) + + CALL ZLATMR( M, M, 'N', ISEED, 'N', ZDA, MODE, COND, & + ZMAX, RSIGN, GRADE, ZDL, MODEL, CONDL, & + ZDR, MODER, CONDR, PIVTNG, IWORK, M, M, & + ZERO, -ONE, 'N', ZA, LDA, IWORK(M+1), INFO ) + DEALLOCATE( ZDR ) + DEALLOCATE( ZDL ) + DEALLOCATE( ZDA ) + DEALLOCATE( IWORK ) + + LZWORK = MAX(1,2*M) + ALLOCATE( ZEIGSA(M) ) + ALLOCATE( ZWORK(LZWORK) ) + ALLOCATE( WORK(2*M) ) + ZAC(1:M,1:M) = ZA(1:M,1:M) + CALL ZGEEV( 'N','N', M, ZAC, LDA, ZEIGSA, ZDUM2X2, 2, & + ZDUM2X2, 2, ZWORK, LZWORK, WORK, INFO ) ! LAPACK CALL + DEALLOCATE(WORK) + DEALLOCATE(ZWORK) + + TMP = ABS(ZEIGSA(IZAMAX(M, ZEIGSA, 1))) ! The spectral radius of ZA + ! Scale the matrix ZA to have unit spectral radius. + CALL ZLASCL( 'G',0, 0, TMP, ONE, M, M, & + ZA, LDA, INFO ) + CALL ZLASCL( 'G',0, 0, TMP, ONE, M, 1, & + ZEIGSA, M, INFO ) + ANORM = ZLANGE( 'F', M, M, ZA, LDA, WDUMMY ) + + IF ( K_TRAJ == 2 ) THEN + ! generate data as two trajectories + ! 
with two inital conditions + CALL ZLARNV(2, ISEED, M, ZF(1,1) ) + DO i = 1, N/2 + CALL ZGEMV( 'N', M, M, ZONE, ZA, LDA, ZF(1,i), 1, & + ZZERO, ZF(1,i+1), 1 ) + END DO + ZX0(1:M,1:N/2) = ZF(1:M,1:N/2) + ZY0(1:M,1:N/2) = ZF(1:M,2:N/2+1) + + CALL ZLARNV(2, ISEED, M, ZF(1,1) ) + DO i = 1, N-N/2 + CALL ZGEMV( 'N', M, M, ZONE, ZA, LDA, ZF(1,i), 1, & + ZZERO, ZF(1,i+1), 1 ) + END DO + ZX0(1:M,N/2+1:N) = ZF(1:M,1:N-N/2) + ZY0(1:M,N/2+1:N) = ZF(1:M,2:N-N/2+1) + ELSE + CALL ZLARNV(2, ISEED, M, ZF(1,1) ) + DO i = 1, N + CALL ZGEMV( 'N', M, M, ZONE, ZA, M, ZF(1,i), 1, & + ZZERO, ZF(1,i+1), 1 ) + END DO + ZF0(1:M,1:N+1) = ZF(1:M,1:N+1) + ZX0(1:M,1:N) = ZF0(1:M,1:N) + ZY0(1:M,1:N) = ZF0(1:M,2:N+1) + END IF + + DEALLOCATE( ZEIGSA ) +!........................................................................ + + DO iJOBZ = 1, 4 + + SELECT CASE ( iJOBZ ) + CASE(1) + JOBZ = 'V' + RESIDS = 'R' + CASE(2) + JOBZ = 'V' + RESIDS = 'N' + CASE(3) + JOBZ = 'F' + RESIDS = 'N' + CASE(4) + JOBZ = 'N' + RESIDS = 'N' + END SELECT + + DO iJOBREF = 1, 3 + + SELECT CASE ( iJOBREF ) + CASE(1) + JOBREF = 'R' + CASE(2) + JOBREF = 'E' + CASE(3) + JOBREF = 'N' + END SELECT + + DO iSCALE = 1, 4 + + SELECT CASE ( iSCALE ) + CASE(1) + SCALE = 'S' + CASE(2) + SCALE = 'C' + CASE(3) + SCALE = 'Y' + CASE(4) + SCALE = 'N' + END SELECT + + DO iNRNK = -1, -2, -1 + NRNK = iNRNK + NRNKsp = iNRNK + + DO iWHTSVD = 1, 3 + ! Check all four options to compute the POD basis + ! via the SVD. + WHTSVD = iWHTSVD + WHTSVDsp = iWHTSVD + + DO LWMINOPT = 1, 2 + ! Workspace query for the minimal (1) and for the optimal + ! (2) workspace lengths determined by workspace query. + + ! ZGEDMD is always tested and its results are also used for + ! comparisons with ZGEDMDQ. + + ZX(1:M,1:N) = ZX0(1:M,1:N) + ZY(1:M,1:N) = ZY0(1:M,1:N) + + CALL ZGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, ZX, LDX, ZY, LDY, NRNK, TOL, & + K, ZEIGS, ZZ, LDZ, RES, ZAU, LDAU, & + ZW, LDW, ZS, LDS, ZDUMMY, -1, & + WDUMMY, -1, IDUMMY, -1, INFO ) + IF ( (INFO .EQ. 2) .OR. ( INFO .EQ. 3 ) & + .OR. ( INFO < 0 ) ) THEN + WRITE(*,*) 'Call to ZGEDMD workspace query failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ', & + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL, LDZ, LDAU, LDW, LDS + STOP + END IF + + LZWORK = INT(ZDUMMY(LWMINOPT)) + LWORK = INT(WDUMMY(1)) + LIWORK = IDUMMY(1) + + ALLOCATE(ZWORK(LZWORK)) + ALLOCATE( WORK(LWORK)) + ALLOCATE(IWORK(LIWORK)) + + CALL ZGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, ZX, LDX, ZY, LDY, NRNK, TOL, & + K, ZEIGS, ZZ, LDZ, RES, ZAU, LDAU, & + ZW, LDW, ZS, LDS, ZWORK, LZWORK, & + WORK, LWORK, IWORK, LIWORK, INFO ) + + IF ( INFO /= 0 ) THEN + WRITE(*,*) 'Call to ZGEDMD failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + STOP + END IF + + SINGVX(1:N) = WORK(1:N) + + !...... ZGEDMD check point + IF ( LSAME(JOBZ,'V') ) THEN + ! Check that Z = X*W, on return from ZGEDMD + ! This checks that the returned eigenvectors in Z are + ! the product of the SVD'POD basis returned in X + ! and the eigenvectors of the rayleigh quotient + ! 
returned in W + CALL ZGEMM( 'N', 'N', M, K, K, ZONE, ZX, LDX, ZW, LDW, & + ZZERO, ZZ1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL ZAXPY( M, -ZONE, ZZ(1,i), 1, ZZ1(1,i), 1) + TMP = MAX(TMP, DZNRM2( M, ZZ1(1,i), 1 ) ) + END DO + TMP_ZXW = MAX(TMP_ZXW, TMP ) + IF ( TMP_ZXW <= 10*M*EPS ) THEN + !WRITE(*,*) ' :) .... OK .........ZGEDMD PASSED.' + ELSE + NFAIL_Z_XV = NFAIL_Z_XV + 1 + WRITE(*,*) ':( .................ZGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + END IF + + + !...... ZGEDMD check point + IF ( LSAME(JOBREF,'R') ) THEN + ! The matrix A*U is returned for computing refined Ritz vectors. + ! Check that A*U is computed correctly using the formula + ! A*U = Y * V * inv(SIGMA). This depends on the + ! accuracy in the computed singular values and vectors of X. + ! See the paper for an error analysis. + ! Note that the left singular vectors of the input matrix X + ! are returned in the array X. + CALL ZGEMM( 'N', 'N', M, K, M, ZONE, ZA, LDA, ZX, LDX, & + ZZERO, ZZ1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL ZAXPY( M, -ZONE, ZAU(1,i), 1, ZZ1(1,i), 1) + TMP = MAX( TMP, DZNRM2( M, ZZ1(1,i),1 ) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_AU = MAX( TMP_AU, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) ':) .... OK .........ZGEDMD PASSED.' + ELSE + NFAIL_AU = NFAIL_AU + 1 + WRITE(*,*) ':( .................ZGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + ELSEIF ( LSAME(JOBREF,'E') ) THEN + ! The unscaled vectors of the Exact DMD are computed. + ! This option is included for the sake of completeness, + ! for users who prefer the Exact DMD vectors. The + ! returned vectors are in the real form, in the same way + ! as the Ritz vectors. Here we just save the vectors + ! and test them separately using a Matlab script. + + + CALL ZGEMM( 'N', 'N', M, K, M, ZONE, ZA, LDA, ZAU, LDAU, ZZERO, ZY1, LDY ) + + DO i=1, K + ! have a real eigenvalue with real eigenvector + CALL ZAXPY( M, -ZEIGS(i), ZAU(1,i), 1, ZY1(1,i), 1 ) + RESEX(i) = DZNRM2( M, ZY1(1,i), 1) / DZNRM2(M,ZAU(1,i),1) + END DO + END IF + !...... ZGEDMD check point + + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by ZGEDMD with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL ZGEMM( 'N', 'N', M, K, M, ZONE, ZA, LDA, ZZ, LDZ, ZZERO, ZY1, LDY ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in ZGEDMD,) + + DO i=1, K + ! have a real eigenvalue with real eigenvector + CALL ZAXPY( M, -ZEIGS(i), ZZ(1,i), 1, ZY1(1,i), 1 ) + RES1(i) = DZNRM2( M, ZY1(1,i), 1) + END DO + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_REZ = MAX( TMP_REZ, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) ':) .... OK ..........ZGEDMD PASSED.' + ELSE + NFAIL_REZ = NFAIL_REZ + 1 + WRITE(*,*) ':( ..................ZGEDMD FAILED!', & + 'Check the code for implementation errors.' 
+ WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + + IF ( LSAME(JOBREF,'E') ) THEN + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) ) + END DO + TMP_EX = MAX(TMP_EX,TMP) + END IF + + END IF + + DEALLOCATE(ZWORK) + DEALLOCATE(WORK) + DEALLOCATE(IWORK) + + IF ( TEST_QRDMD .AND. (K_TRAJ == 1) ) THEN + + ZF(1:M,1:N+1) = ZF0(1:M,1:N+1) + + CALL ZGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, & + WHTSVD, M, N+1, ZF, LDF, ZX, LDX, ZY, LDY, & + NRNK, TOL, K, ZEIGS, ZZ, LDZ, RES, ZAU, & + LDAU, ZW, LDW, ZS, LDS, ZDUMMY, -1, & + WDUMMY, -1, IDUMMY, -1, INFO ) + + LZWORK = INT(ZDUMMY(LWMINOPT)) + ALLOCATE( ZWORK(LZWORK) ) + LIWORK = IDUMMY(1) + ALLOCATE(IWORK(LIWORK)) + LWORK = INT(WDUMMY(1)) + ALLOCATE(WORK(LWORK)) + + CALL ZGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, & + WHTSVD, M, N+1, ZF, LDF, ZX, LDX, ZY, LDY, & + NRNK, TOL, KQ, ZEIGS, ZZ, LDZ, RES, ZAU, & + LDAU, ZW, LDW, ZS, LDS, ZWORK, LZWORK, & + WORK, LWORK, IWORK, LIWORK, INFO ) + + IF ( INFO /= 0 ) THEN + WRITE(*,*) 'Call to ZGEDMDQ failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, WANTQ, WANTR, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + STOP + END IF + SINGVQX(1:N) = WORK(1:N) + + !..... ZGEDMDQ check point + + IF ( 1 == 0 ) THEN + ! Comparison of ZGEDMD and ZGEDMDQ singular values disabled + TMP = ZERO + DO i = 1, MIN(K, KQ) + TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / & + SINGVX(1) ) + END DO + SVDIFF = MAX( SVDIFF, TMP ) + IF ( TMP > M*N*EPS ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_SVDIFF = NFAIL_SVDIFF + 1 + DO j =1, 3 + write(*,*) j, SINGVX(j), SINGVQX(j) + read(*,*) + END DO + + END IF + END IF + + !..... ZGEDMDQ check point + IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN + ! Check that the QR factors are computed and returned + ! as requested. The residual ||F-Q*R||_F / ||F||_F + ! is compared to M*N*EPS. + ZF1(1:M,1:N+1) = ZF0(1:M,1:N+1) + CALL ZGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -ZONE, ZF, & + LDF, ZY, LDY, ZONE, ZF1, LDF ) + TMP_FQR = ZLANGE( 'F', M, N+1, ZF1, LDF, WORK ) / & + ZLANGE( 'F', M, N+1, ZF0, LDF, WORK ) + IF ( TMP_FQR > TOL2 ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_F_QR = NFAIL_F_QR + 1 + ELSE + !WRITE(*,*) '........ PASSED.' + END IF + END IF + + !..... ZGEDMDQ check point + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by ZGEDMDQ with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL ZGEMM( 'N', 'N', M, KQ, M, ZONE, ZA, LDA, ZZ, LDZ, ZZERO, ZY1, LDY ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in ZGEDMDQ) + + DO i=1, KQ + ! have a real eigenvalue with real eigenvector + CALL ZAXPY( M, -ZEIGS(i), ZZ(1,i), 1, ZY1(1,i), 1 ) + ! Y(1:M,i) = Y(1:M,i) - REIG(i)*Z(1:M,i) + RES1(i) = DZNRM2( M, ZY1(1,i), 1) + END DO + TMP = ZERO + DO i = 1, KQ + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVQX(KQ)/(ANORM*SINGVQX(1)) ) + END DO + TMP_REZQ = MAX( TMP_REZQ, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) '.... OK ........ ZGEDMDQ PASSED.' + ELSE + NFAIL_REZQ = NFAIL_REZQ + 1 + WRITE(*,*) '................ ZGEDMDQ FAILED!', & + 'Check the code for implementation errors.' 
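+                     ! (RES returned by ZGEDMDQ disagreeing with the
+                     !  directly recomputed residuals RES1 by more than
+                     !  TOL2 points to an implementation error rather
+                     !  than to roundoff, so the driver stops here
+                     !  instead of only counting the failure.)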
+ STOP + END IF + + END IF + + DEALLOCATE( ZWORK ) + DEALLOCATE( WORK ) + DEALLOCATE( IWORK ) + + END IF ! ZGEDMDQ + +!....................................................................................................... + + END DO ! LWMINOPT + !write(*,*) 'LWMINOPT loop completed' + END DO ! iWHTSVD + !write(*,*) 'WHTSVD loop completed' + END DO ! iNRNK -2:-1 + !write(*,*) 'NRNK loop completed' + END DO ! iSCALE 1:4 + !write(*,*) 'SCALE loop completed' + END DO + !write(*,*) 'JOBREF loop completed' + END DO ! iJOBZ + !write(*,*) 'JOBZ loop completed' + + END DO ! MODE -6:6 + !write(*,*) 'MODE loop completed' + END DO ! 1 or 2 trajectories + !write(*,*) 'trajectories loop completed' + + DEALLOCATE( ZA ) + DEALLOCATE( ZAC ) + DEALLOCATE( ZZ ) + DEALLOCATE( ZF ) + DEALLOCATE( ZF0 ) + DEALLOCATE( ZF1 ) + DEALLOCATE( ZX ) + DEALLOCATE( ZX0 ) + DEALLOCATE( ZY ) + DEALLOCATE( ZY0 ) + DEALLOCATE( ZY1 ) + DEALLOCATE( ZAU ) + DEALLOCATE( ZW ) + DEALLOCATE( ZS ) + DEALLOCATE( ZZ1 ) + DEALLOCATE( RES ) + DEALLOCATE( RES1 ) + DEALLOCATE( RESEX ) + DEALLOCATE( ZEIGS ) + DEALLOCATE( SINGVX ) + DEALLOCATE( SINGVQX ) + + END DO ! LLOOP + + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for ZGEDMD :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + IF ( NFAIL_Z_XV == 0 ) THEN + WRITE(*,*) '>>>> Z - U*V test PASSED.' + ELSE + WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)' + WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_ZXW + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV + END IF + IF ( NFAIL_AU == 0 ) THEN + WRITE(*,*) '>>>> A*U test PASSED. ' + ELSE + WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)' + WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU + END IF + + IF ( NFAIL_REZ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' + ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ + END IF + + IF ( NFAIL_TOTAL == 0 ) THEN + WRITE(*,*) '>>>> ZGEDMD :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAIL_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>>>>>>>>> ZGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + IF ( TEST_QRDMD ) THEN + WRITE(*,*) + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for ZGEDMDQ :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + + IF ( NFAIL_SVDIFF == 0 ) THEN + WRITE(*,*) '>>>> ZGEDMD and ZGEDMDQ computed singular & + &values test PASSED.' + ELSE + WRITE(*,*) 'ZGEDMD and ZGEDMDQ discrepancies in & + &the singular values unacceptable ', & + NFAIL_SVDIFF, ' times. Test FAILED.' + WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF + END IF + + IF ( NFAIL_F_QR == 0 ) THEN + WRITE(*,*) '>>>> F - Q*R test PASSED.' + ELSE + WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)' + WRITE(*,*) 'The largest relative residual was ', TMP_FQR + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR + END IF + + IF ( NFAIL_REZQ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' 
+ ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZQ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ + END IF + + IF ( NFAILQ_TOTAL == 0 ) THEN + WRITE(*,*) '>>>>>>> ZGEDMDQ :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>> ZGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + END IF + + WRITE(*,*) + WRITE(*,*) 'Test completed.' + STOP + END diff --git a/lapack-netlib/TESTING/EIG/zerrst.f b/lapack-netlib/TESTING/EIG/zerrst.f index d7b41c053..31881c4de 100644 --- a/lapack-netlib/TESTING/EIG/zerrst.f +++ b/lapack-netlib/TESTING/EIG/zerrst.f @@ -748,17 +748,17 @@ CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 2*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) NT = NT + 12 @@ -830,19 +830,19 @@ INFOT = 18 CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 26*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) NT = NT + 13 diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index 676857a80..143fd0597 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -9,7 +9,7 @@ set(DZLNTST dlaord.f) set(SLINTST schkaa.F schkeq.f schkgb.f schkge.f schkgt.f schklq.f schkpb.f schkpo.f schkps.f schkpp.f - schkpt.f schkq3.f schkql.f schkqr.f schkrq.f + schkpt.f schkq3.f schkqp3rk.f schkql.f schkqr.f schkrq.f schksp.f schksy.f schksy_rook.f schksy_rk.f schksy_aa.f schksy_aa_2stage.f schktb.f schktp.f schktr.f @@ -56,7 +56,7 @@ set(CLINTST cchkaa.F cchkhe.f cchkhe_rook.f cchkhe_rk.f cchkhe_aa.f cchkhe_aa_2stage.f cchkhp.f cchklq.f cchkpb.f - cchkpo.f cchkps.f cchkpp.f cchkpt.f cchkq3.f cchkql.f + cchkpo.f cchkps.f cchkpp.f cchkpt.f cchkq3.f cchkqp3rk.f cchkql.f cchkqr.f cchkrq.f cchksp.f cchksy.f cchksy_rook.f cchksy_rk.f cchksy_aa.f cchksy_aa_2stage.f cchktb.f @@ -110,7 +110,7 @@ endif() set(DLINTST dchkaa.F dchkeq.f dchkgb.f dchkge.f dchkgt.f dchklq.f dchkpb.f dchkpo.f dchkps.f dchkpp.f - dchkpt.f dchkq3.f dchkql.f dchkqr.f dchkrq.f + dchkpt.f dchkq3.f dchkqp3rk.f dchkql.f dchkqr.f dchkrq.f dchksp.f dchksy.f dchksy_rook.f dchksy_rk.f 
dchksy_aa.f dchksy_aa_2stage.f dchktb.f dchktp.f dchktr.f @@ -158,7 +158,7 @@ set(ZLINTST zchkaa.F zchkhe.f zchkhe_rook.f zchkhe_rk.f zchkhe_aa.f zchkhe_aa_2stage.f zchkhp.f zchklq.f zchkpb.f - zchkpo.f zchkps.f zchkpp.f zchkpt.f zchkq3.f zchkql.f + zchkpo.f zchkps.f zchkpp.f zchkpt.f zchkq3.f zchkqp3rk.f zchkql.f zchkqr.f zchkrq.f zchksp.f zchksy.f zchksy_rook.f zchksy_rk.f zchksy_aa.f zchksy_aa_2stage.f zchktb.f diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile index 64abc4dba..714efa52a 100644 --- a/lapack-netlib/TESTING/LIN/Makefile +++ b/lapack-netlib/TESTING/LIN/Makefile @@ -45,7 +45,7 @@ DZLNTST = dlaord.o SLINTST = schkaa.o \ schkeq.o schkgb.o schkge.o schkgt.o \ schklq.o schkpb.o schkpo.o schkps.o schkpp.o \ - schkpt.o schkq3.o schkql.o schkqr.o schkrq.o \ + schkpt.o schkq3.o schkqp3rk.o schkql.o schkqr.o schkrq.o \ schksp.o schksy.o schksy_rook.o schksy_rk.o \ schksy_aa.o schksy_aa_2stage.o schktb.o schktp.o schktr.o \ schktz.o \ @@ -89,7 +89,7 @@ CLINTST = cchkaa.o \ cchkeq.o cchkgb.o cchkge.o cchkgt.o \ cchkhe.o cchkhe_rook.o cchkhe_rk.o \ cchkhe_aa.o cchkhe_aa_2stage.o cchkhp.o cchklq.o cchkpb.o \ - cchkpo.o cchkps.o cchkpp.o cchkpt.o cchkq3.o cchkql.o \ + cchkpo.o cchkps.o cchkpp.o cchkpt.o cchkq3.o cchkqp3rk.o cchkql.o \ cchkqr.o cchkrq.o cchksp.o cchksy.o cchksy_rook.o cchksy_rk.o \ cchksy_aa.o cchksy_aa_2stage.o cchktb.o \ cchktp.o cchktr.o cchktz.o \ @@ -137,7 +137,7 @@ endif DLINTST = dchkaa.o \ dchkeq.o dchkgb.o dchkge.o dchkgt.o \ dchklq.o dchkpb.o dchkpo.o dchkps.o dchkpp.o \ - dchkpt.o dchkq3.o dchkql.o dchkqr.o dchkrq.o \ + dchkpt.o dchkq3.o dchkqp3rk.o dchkql.o dchkqr.o dchkrq.o \ dchksp.o dchksy.o dchksy_rook.o dchksy_rk.o \ dchksy_aa.o dchksy_aa_2stage.o dchktb.o dchktp.o dchktr.o \ dchktz.o \ @@ -182,7 +182,7 @@ ZLINTST = zchkaa.o \ zchkeq.o zchkgb.o zchkge.o zchkgt.o \ zchkhe.o zchkhe_rook.o zchkhe_rk.o zchkhe_aa.o zchkhe_aa_2stage.o \ zchkhp.o zchklq.o zchkpb.o \ - zchkpo.o zchkps.o zchkpp.o zchkpt.o zchkq3.o zchkql.o \ + zchkpo.o zchkps.o zchkpp.o zchkpt.o zchkq3.o zchkqp3rk.o zchkql.o \ zchkqr.o zchkrq.o zchksp.o zchksy.o zchksy_rook.o zchksy_rk.o \ zchksy_aa.o zchksy_aa_2stage.o zchktb.o \ zchktp.o zchktr.o zchktz.o \ diff --git a/lapack-netlib/TESTING/LIN/alaerh.f b/lapack-netlib/TESTING/LIN/alaerh.f index 1845888a6..6c8a47f1e 100644 --- a/lapack-netlib/TESTING/LIN/alaerh.f +++ b/lapack-netlib/TESTING/LIN/alaerh.f @@ -797,6 +797,18 @@ WRITE( NOUT, FMT = 9978 ) $ SUBNAM(1:LEN_TRIM( SUBNAM )), INFO, M, N, IMAT END IF +* + ELSE IF( LSAMEN( 2, P2, 'QK' ) ) THEN +* +* xQK: truncated QR factorization with pivoting +* + IF( LSAMEN( 7, SUBNAM( 2: 8 ), 'GEQP3RK' ) ) THEN + WRITE( NOUT, FMT = 9930 ) + $ SUBNAM(1:LEN_TRIM( SUBNAM )), INFO, M, N, KL, N5, IMAT + ELSE IF( LSAMEN( 5, SUBNAM( 2: 6 ), 'LATMS' ) ) THEN + WRITE( NOUT, FMT = 9978 ) + $ SUBNAM(1:LEN_TRIM( SUBNAM )), INFO, M, N, IMAT + END IF * ELSE IF( LSAMEN( 2, P2, 'LQ' ) ) THEN * @@ -1147,6 +1159,11 @@ * What we do next * 9949 FORMAT( ' ==> Doing only the condition estimate for this case' ) +* +* SUBNAM, INFO, M, N, NB, IMAT +* + 9930 FORMAT( ' *** Error code from ', A, '=', I5, / ' ==> M =', I5, + $ ', N =', I5, ', NX =', I5, ', NB =', I4, ', type ', I2 ) * RETURN * diff --git a/lapack-netlib/TESTING/LIN/alahd.f b/lapack-netlib/TESTING/LIN/alahd.f index dd75394b3..8f966c584 100644 --- a/lapack-netlib/TESTING/LIN/alahd.f +++ b/lapack-netlib/TESTING/LIN/alahd.f @@ -584,13 +584,27 @@ * * QR decomposition with column pivoting * - WRITE( IOUNIT, FMT = 9986 )PATH + WRITE( 
IOUNIT, FMT = 8006 )PATH WRITE( IOUNIT, FMT = 9969 ) WRITE( IOUNIT, FMT = '( '' Test ratios:'' )' ) WRITE( IOUNIT, FMT = 9940 )1 WRITE( IOUNIT, FMT = 9939 )2 WRITE( IOUNIT, FMT = 9938 )3 WRITE( IOUNIT, FMT = '( '' Messages:'' )' ) +* + ELSE IF( LSAMEN( 2, P2, 'QK' ) ) THEN +* +* truncated QR decomposition with column pivoting +* + WRITE( IOUNIT, FMT = 8006 )PATH + WRITE( IOUNIT, FMT = 9871 ) + WRITE( IOUNIT, FMT = '( '' Test ratios:'' )' ) + WRITE( IOUNIT, FMT = 8060 )1 + WRITE( IOUNIT, FMT = 8061 )2 + WRITE( IOUNIT, FMT = 8062 )3 + WRITE( IOUNIT, FMT = 8063 )4 + WRITE( IOUNIT, FMT = 8064 )5 + WRITE( IOUNIT, FMT = '( '' Messages:'' )' ) * ELSE IF( LSAMEN( 2, P2, 'TZ' ) ) THEN * @@ -779,6 +793,8 @@ $ 'tall-skinny or short-wide matrices' ) 8005 FORMAT( / 1X, A3, ': Householder reconstruction from TSQR', $ ' factorization output ', /,' for tall-skinny matrices.' ) + 8006 FORMAT( / 1X, A3, ': truncated QR factorization', + $ ' with column pivoting' ) * * GE matrix types * @@ -922,6 +938,36 @@ $ / 4X, '3. Geometric distribution', 10X, $ '6. Every second column fixed' ) * +* QK matrix types +* + 9871 FORMAT( 4X, ' 1. Zero matrix', / + $ 4X, ' 2. Random, Diagonal, CNDNUM = 2', / + $ 4X, ' 3. Random, Upper triangular, CNDNUM = 2', / + $ 4X, ' 4. Random, Lower triangular, CNDNUM = 2', / + $ 4X, ' 5. Random, First column is zero, CNDNUM = 2', / + $ 4X, ' 6. Random, Last MINMN column is zero, CNDNUM = 2', / + $ 4X, ' 7. Random, Last N column is zero, CNDNUM = 2', / + $ 4X, ' 8. Random, Middle column in MINMN is zero,', + $ ' CNDNUM = 2', / + $ 4X, ' 9. Random, First half of MINMN columns are zero,', + $ ' CNDNUM = 2', / + $ 4X, '10. Random, Last columns are zero starting from', + $ ' MINMN/2+1, CNDNUM = 2', / + $ 4X, '11. Random, Half MINMN columns in the middle are', + $ ' zero starting from MINMN/2-(MINMN/2)/2+1,' + $ ' CNDNUM = 2', / + $ 4X, '12. Random, Odd columns are ZERO, CNDNUM = 2', / + $ 4X, '13. Random, Even columns are ZERO, CNDNUM = 2', / + $ 4X, '14. Random, CNDNUM = 2', / + $ 4X, '15. Random, CNDNUM = sqrt(0.1/EPS)', / + $ 4X, '16. Random, CNDNUM = 0.1/EPS', / + $ 4X, '17. Random, CNDNUM = 0.1/EPS,', + $ ' one small singular value S(N)=1/CNDNUM', / + $ 4X, '18. Random, CNDNUM = 2, scaled near underflow,', + $ ' NORM = SMALL = SAFMIN', / + $ 4X, '19. 
Random, CNDNUM = 2, scaled near overflow,', + $ ' NORM = LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) )' ) +* * TZ matrix types * 9968 FORMAT( ' Matrix types (2-3 have condition 1/EPS):', / 4X, @@ -1030,9 +1076,8 @@ $ ' * norm(C) * EPS )' ) 9940 FORMAT( 3X, I2, ': norm(svd(A) - svd(R)) / ', $ '( M * norm(svd(R)) * EPS )' ) - 9939 FORMAT( 3X, I2, ': norm( A*P - Q*R ) / ( M * norm(A) * EPS )' - $ ) - 9938 FORMAT( 3X, I2, ': norm( I - Q''*Q ) / ( M * EPS )' ) + 9939 FORMAT( 3X, I2, ': norm( A*P - Q*R ) / ( M * norm(A) * EPS )') + 9938 FORMAT( 3X, I2, ': norm( I - Q''*Q ) / ( M * EPS )' ) 9937 FORMAT( 3X, I2, ': norm( A - R*Q ) / ( M * norm(A) * EPS )' $ ) 9935 FORMAT( 3X, I2, ': norm( B - A * X ) / ', @@ -1105,6 +1150,15 @@ 8054 FORMAT(3X,I2,': norm( C*Q - C*Q ) / ( M * norm(C) * EPS )' ) 8055 FORMAT(3X,I2,': norm( C*Q'' - C*Q'' ) / ( M * norm(C) * EPS )') + 8060 FORMAT( 3X, I2, ': 2-norm(svd(A) - svd(R)) / ', + $ '( max(M,N) * 2-norm(svd(R)) * EPS )' ) + 8061 FORMAT( 3X, I2, ': 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A)', + $ ' * EPS )') + 8062 FORMAT( 3X, I2, ': 1-norm( I - Q''*Q ) / ( M * EPS )' ) + 8063 FORMAT( 3X, I2, ': Returns 1.0D+100, if abs(R(K+1,K+1))', + $ ' > abs(R(K,K)), where K=1:KFACT-1' ) + 8064 FORMAT( 3X, I2, ': 1-norm(Q**T * B - Q**T * B ) / ( M * EPS )') + * RETURN * diff --git a/lapack-netlib/TESTING/LIN/alareq.f b/lapack-netlib/TESTING/LIN/alareq.f index db18775eb..3f057fa48 100644 --- a/lapack-netlib/TESTING/LIN/alareq.f +++ b/lapack-netlib/TESTING/LIN/alareq.f @@ -28,12 +28,12 @@ *> to evaluate the input line which requested NMATS matrix types for *> PATH. The flow of control is as follows: *> -*> If NMATS = NTYPES then +*> IF NMATS = NTYPES THEN *> DOTYPE(1:NTYPES) = .TRUE. -*> else +*> ELSE *> Read the next input line for NMATS matrix types *> Set DOTYPE(I) = .TRUE. for each valid type I -*> endif +*> END IF *> \endverbatim * * Arguments: diff --git a/lapack-netlib/TESTING/LIN/cchkaa.F b/lapack-netlib/TESTING/LIN/cchkaa.F index ec1534ed4..57d95c741 100644 --- a/lapack-netlib/TESTING/LIN/cchkaa.F +++ b/lapack-netlib/TESTING/LIN/cchkaa.F @@ -69,6 +69,7 @@ *> CLQ 8 List types on next line if 0 < NTYPES < 8 *> CQL 8 List types on next line if 0 < NTYPES < 8 *> CQP 6 List types on next line if 0 < NTYPES < 6 +*> ZQK 19 List types on next line if 0 < NTYPES < 19 *> CTZ 3 List types on next line if 0 < NTYPES < 3 *> CLS 6 List types on next line if 0 < NTYPES < 6 *> CEQ @@ -153,12 +154,11 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - REAL S( 2*NMAX ) - COMPLEX E( NMAX ) * .. * .. Allocatable Arrays .. INTEGER AllocateStatus - REAL, DIMENSION(:), ALLOCATABLE :: RWORK + REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S + COMPLEX, DIMENSION(:), ALLOCATABLE :: E COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK * .. * .. External Functions .. 
@@ -170,14 +170,14 @@ EXTERNAL ALAREQ, CCHKEQ, CCHKGB, CCHKGE, CCHKGT, CCHKHE, $ CCHKHE_ROOK, CCHKHE_RK, CCHKHE_AA, CCHKHP, $ CCHKLQ, CCHKUNHR_COL, CCHKPB, CCHKPO, CCHKPS, - $ CCHKPP, CCHKPT, CCHKQ3, CCHKQL, CCHKQR, CCHKRQ, - $ CCHKSP, CCHKSY, CCHKSY_ROOK, CCHKSY_RK, - $ CCHKSY_AA, CCHKTB, CCHKTP, CCHKTR, CCHKTZ, - $ CDRVGB, CDRVGE, CDRVGT, CDRVHE, CDRVHE_ROOK, - $ CDRVHE_RK, CDRVHE_AA, CDRVHP, CDRVLS, CDRVPB, - $ CDRVPO, CDRVPP, CDRVPT, CDRVSP, CDRVSY, - $ CDRVSY_ROOK, CDRVSY_RK, CDRVSY_AA, ILAVER, - $ CCHKQRT, CCHKQRTP + $ CCHKPP, CCHKPT, CCHKQ3, CCHKQP3RK, CCHKQL, + $ CCHKQR, CCHKRQ, CCHKSP, CCHKSY, CCHKSY_ROOK, + $ CCHKSY_RK, CCHKSY_AA, CCHKTB, CCHKTP, CCHKTR, + $ CCHKTZ, CDRVGB, CDRVGE, CDRVGT, CDRVHE, + $ CDRVHE_ROOK, CDRVHE_RK, CDRVHE_AA, CDRVHP, + $ CDRVLS, CDRVPB, CDRVPO, CDRVPP, CDRVPT, CDRVSP, + $ CDRVSY, CDRVSY_ROOK, CDRVSY_RK, CDRVSY_AA, + $ ILAVER, CCHKQRT, CCHKQRTP * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -203,6 +203,10 @@ IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" ALLOCATE ( WORK( NMAX, NMAX+MAXRHS+10 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( E( NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" ALLOCATE ( RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. @@ -1109,6 +1113,23 @@ ELSE WRITE( NOUT, FMT = 9989 )PATH END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* QK: truncated QR factorization with pivoting +* + NTYPES = 19 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ), + $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ S( 1 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF * ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN * @@ -1211,6 +1232,8 @@ * DEALLOCATE (A, STAT = AllocateStatus) DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (E, STAT = AllocateStatus) + DEALLOCATE (S, STAT = AllocateStatus) DEALLOCATE (WORK, STAT = AllocateStatus) DEALLOCATE (RWORK, STAT = AllocateStatus) * diff --git a/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f index 30a61261f..d79978e55 100644 --- a/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f @@ -433,9 +433,9 @@ * block factorization, LWORK is the length of AINV. 
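+*          (The MAX( 1, ... ) guards added below keep the workspace
+*          sizes positive when N or NB is zero; LAPACK requires
+*          LWORK >= 1.)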
* SRNAMT = 'CHETRF_AA_2STAGE' - LWORK = MIN(N*NB, 3*NMAX*NMAX) - CALL CHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, - $ AINV, (3*NB+1)*N, + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX) + CALL CHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ WORK, LWORK, $ INFO ) @@ -517,7 +517,6 @@ c NT = 1 CALL CLACPY( 'Full', N, NRHS, B, LDA, X, LDA ) * SRNAMT = 'CHETRS_AA_2STAGE' - LWORK = MAX( 1, 3*N-2 ) CALL CHETRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, $ AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ), $ X, LDA, INFO ) diff --git a/lapack-netlib/TESTING/LIN/cchkqp3rk.f b/lapack-netlib/TESTING/LIN/cchkqp3rk.f new file mode 100644 index 000000000..79d6add72 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/cchkqp3rk.f @@ -0,0 +1,836 @@ +*> \brief \b CCHKQP3RK +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, +* $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, +* $ B, COPYB, S, TAU, +* $ WORK, RWORK, IWORK, NOUT ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER NM, NN, NNB, NOUT +* REAL THRESH +* .. +* .. Array Arguments .. +* LOGICAL DOTYPE( * ) +* INTEGER IWORK( * ), MVAL( * ), NBVAL( * ), NVAL( * ), +* $ NXVAL( * ) +* REAL S( * ), RWORK( * ) +* COMPLEX A( * ), COPYA( * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CCHKQP3RK tests CGEQP3RK. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] DOTYPE +*> \verbatim +*> DOTYPE is LOGICAL array, dimension (NTYPES) +*> The matrix types to be used for testing. Matrices of type j +*> (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) = +*> .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used. +*> \endverbatim +*> +*> \param[in] NM +*> \verbatim +*> NM is INTEGER +*> The number of values of M contained in the vector MVAL. +*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. +*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNS +*> \verbatim +*> NNS is INTEGER +*> The number of values of NRHS contained in the vector NSVAL. +*> \endverbatim +*> +*> \param[in] NSVAL +*> \verbatim +*> NSVAL is INTEGER array, dimension (NNS) +*> The values of the number of right hand sides NRHS. +*> \endverbatim +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB and NX contained in the +*> vectors NBVAL and NXVAL. The blocking parameters are used +*> in pairs (NB,NX). +*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NNB) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NXVAL +*> \verbatim +*> NXVAL is INTEGER array, dimension (NNB) +*> The values of the crossover point NX. +*> \endverbatim +*> +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. 
+*> \endverbatim +*> +*> \param[out] A +*> \verbatim +*> A is COMPLEX array, dimension (MMAX*NMAX) +*> where MMAX is the maximum value of M in MVAL and NMAX is the +*> maximum value of N in NVAL. +*> \endverbatim +*> +*> \param[out] COPYA +*> \verbatim +*> COPYA is COMPLEX array, dimension (MMAX*NMAX) +*> \endverbatim +*> +*> \param[out] B +*> \verbatim +*> B is COMPLEX array, dimension (MMAX*NSMAX) +*> where MMAX is the maximum value of M in MVAL and NSMAX is the +*> maximum value of NRHS in NSVAL. +*> \endverbatim +*> +*> \param[out] COPYB +*> \verbatim +*> COPYB is COMPLEX array, dimension (MMAX*NSMAX) +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is REAL array, dimension +*> (min(MMAX,NMAX)) +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX array, dimension (MMAX) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension +*> (max(M*max(M,N) + 4*min(M,N) + max(M,N))) +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (4*NMAX) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (2*NMAX) +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex_lin +* +* ===================================================================== + SUBROUTINE CCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, + $ B, COPYB, S, TAU, + $ WORK, RWORK, IWORK, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER NM, NN, NNB, NNS, NOUT + REAL THRESH +* .. +* .. Array Arguments .. + LOGICAL DOTYPE( * ) + INTEGER IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ), + $ NSVAL( * ), NXVAL( * ) + REAL S( * ), RWORK( * ) + COMPLEX A( * ), COPYA( * ), B( * ), COPYB( * ), + $ TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NTYPES + PARAMETER ( NTYPES = 19 ) + INTEGER NTESTS + PARAMETER ( NTESTS = 5 ) + REAL ONE, ZERO, BIGNUM + COMPLEX CONE, CZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0, + $ CZERO = ( 0.0E+0, 0.0E+0 ), + $ CONE = ( 1.0E+0, 0.0E+0 ), + $ BIGNUM = 1.0E+38 ) +* .. +* .. Local Scalars .. + CHARACTER DIST, TYPE + CHARACTER*3 PATH + INTEGER I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO, + $ INB, IND_OFFSET_GEN, + $ IND_IN, IND_OUT, INS, INFO, + $ ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO, + $ KFACT, KL, KMAX, KU, LDA, LW, LWORK, + $ LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N, + $ NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS, + $ NRUN, NX, T + REAL ANORM, CNDNUM, EPS, ABSTOL, RELTOL, + $ DTEMP, MAXC2NRMK, RELMAXC2NRMK +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ), ISEEDY( 4 ) + REAL RESULT( NTESTS ), RDUMMY( 1 ) +* .. +* .. External Functions .. + REAL SLAMCH, CQPT01, CQRT11, CQRT12, CLANGE + EXTERNAL SLAMCH, CQPT01, CQRT11, CQRT12, CLANGE +* .. +* .. External Subroutines .. + EXTERNAL ALAERH, ALAHD, ALASUM, SLAORD, ICOPY, CAXPY, + $ XLAENV, CGEQP3RK, CLACPY, CLASET, CLATB4, + $ CLATMS, CUNMQR, CSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, MOD, REAL +* .. +* .. Scalars in Common .. 
+ LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, IOUNIT, CUNMQR_LWORK +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, IOUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEEDY / 1988, 1989, 1990, 1991 / +* .. +* .. Executable Statements .. +* +* Initialize constants and the random number seed. +* + PATH( 1: 1 ) = 'Complex precision' + PATH( 2: 3 ) = 'QK' + NRUN = 0 + NFAIL = 0 + NERRS = 0 + DO I = 1, 4 + ISEED( I ) = ISEEDY( I ) + END DO + EPS = SLAMCH( 'Epsilon' ) + INFOT = 0 +* + DO IM = 1, NM +* +* Do for each value of M in MVAL. +* + M = MVAL( IM ) + LDA = MAX( 1, M ) +* + DO IN = 1, NN +* +* Do for each value of N in NVAL. +* + N = NVAL( IN ) + MINMN = MIN( M, N ) + LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ), + $ M*N + 2*MINMN + 4*N ) +* + DO INS = 1, NNS + NRHS = NSVAL( INS ) +* +* Set up parameters with CLATB4 and generate +* M-by-NRHS B matrix with CLATMS. +* IMAT = 14: +* Random matrix, CNDNUM = 2, NORM = ONE, +* MODE = 3 (geometric distribution of singular values). +* + CALL CLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'CLATMS' + CALL CLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYB, LDA, WORK, INFO ) +* +* Check error code from CLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'CLATMS', INFO, 0, ' ', M, + $ NRHS, -1, -1, -1, 6, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + DO IMAT = 1, NTYPES +* +* Do the tests only if DOTYPE( IMAT ) is true. +* + IF( .NOT.DOTYPE( IMAT ) ) + $ CYCLE +* +* The type of distribution used to generate the random +* eigen-/singular values: +* ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 ) +* +* Do for each type of NON-SYMMETRIC matrix: CNDNUM NORM MODE +* 1. Zero matrix +* 2. Random, Diagonal, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 3. Random, Upper triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 4. Random, Lower triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 5. Random, First column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 6. Random, Last MINMN column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 7. Random, Last N column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 8. Random, Middle column in MINMN is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 9. Random, First half of MINMN columns are zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 11. Random, Half MINMN columns in the middle are zero starting +* from MINMN/2-(MINMN/2)/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 12. Random, Odd columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 13. Random, Even columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 14. Random, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 15. Random, CNDNUM = sqrt(0.1/EPS) CNDNUM = BADC1 = sqrt(0.1/EPS) ONE 3 ( geometric distribution of singular values ) +* 16. Random, CNDNUM = 0.1/EPS CNDNUM = BADC2 = 0.1/EPS ONE 3 ( geometric distribution of singular values ) +* 17. 
Random, CNDNUM = 0.1/EPS, CNDNUM = BADC2 = 0.1/EPS ONE 2 ( one small singular value, S(N)=1/CNDNUM )
+*       one small singular value S(N)=1/CNDNUM
+*   18. Random, CNDNUM = 2, scaled near underflow CNDNUM = 2 SMALL = SAFMIN
+*   19. Random, CNDNUM = 2, scaled near overflow CNDNUM = 2 LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) ) 3 ( geometric distribution of singular values )
+*
+               IF( IMAT.EQ.1 ) THEN
+*
+*                 Matrix 1: Zero matrix
+*
+                  CALL CLASET( 'Full', M, N, CZERO, CZERO, COPYA, LDA )
+                  DO I = 1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 )
+     $                 .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN
+*
+*                 Matrices 2-4 and 14-19.
+*
+*                 Set up parameters with CLATB4 and generate a test
+*                 matrix with CLATMS.
+*
+                  CALL CLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'CLATMS'
+                  CALL CLATMS( M, N, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA, LDA, WORK, INFO )
+*
+*                 Check error code from CLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'CLATMS', INFO, 0, ' ', M, N,
+     $                            -1, -1, -1, IMAT, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+                  CALL SLAORD( 'Decreasing', MINMN, S, 1 )
+*
+               ELSE IF( MINMN.GE.2
+     $                 .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN
+*
+*                 Rectangular matrices 5-13 that contain zero columns;
+*                 used only when MINMN >= 2.
+*
+*                 JB_ZERO is the column index of the ZERO block.
+*                 NB_ZERO is the column block size of the ZERO block.
+*                 NB_GEN is the column block size of the
+*                 generated block.
+*                 J_INC is the non-zero column index increment
+*                 for matrices 12 and 13.
+*                 J_FIRST_NZ is the index of the first non-zero
+*                 column.
+*
+                  IF( IMAT.EQ.5 ) THEN
+*
+*                    First column is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.6 ) THEN
+*
+*                    Last column MINMN is zero.
+*
+                     JB_ZERO = MINMN
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.7 ) THEN
+*
+*                    Last column N is zero.
+*
+                     JB_ZERO = N
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.8 ) THEN
+*
+*                    Middle column in MINMN is zero.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.9 ) THEN
+*
+*                    First half of MINMN columns is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.10 ) THEN
+*
+*                    Last columns are zero columns,
+*                    starting from (MINMN / 2 + 1) column.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = N - JB_ZERO + 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.11 ) THEN
+*
+*                    Half of the columns in the middle of MINMN
+*                    columns are zero, starting from
+*                    MINMN/2 - (MINMN/2)/2 + 1 column.
+*
+                     JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.12 ) THEN
+*
+*                    Odd-numbered columns are zero.
+*
+                     NB_GEN = N / 2
+                     NB_ZERO = N - NB_GEN
+                     J_INC = 2
+                     J_FIRST_NZ = 2
+*
+                  ELSE IF( IMAT.EQ.13 ) THEN
+*
+*                    Even-numbered columns are zero.
+*
+                     NB_ZERO = N / 2
+                     NB_GEN = N - NB_ZERO
+                     J_INC = 2
+                     J_FIRST_NZ = 1
+*
+                  END IF
+*
+*
+*                 1) Set the first NB_ZERO columns in COPYA(1:M,1:N)
+*                    to zero.
+*
+                  CALL CLASET( 'Full', M, NB_ZERO, CZERO, CZERO,
+     $                         COPYA, LDA )
+*
+*                 2) Generate an M-by-(N-NB_ZERO) matrix with the
+*                    chosen singular value distribution
+*                    in COPYA(1:M,NB_ZERO+1:N).
+*
+                  CALL CLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU,
+     $                         ANORM, MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'CLATMS'
+*
+                  IND_OFFSET_GEN = NB_ZERO * LDA
+*
+                  CALL CLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA( IND_OFFSET_GEN + 1 ), LDA,
+     $                         WORK, INFO )
+*
+*                 Check error code from CLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'CLATMS', INFO, 0, ' ', M,
+     $                            NB_GEN, -1, -1, -1, IMAT, NFAIL,
+     $                            NERRS, NOUT )
+                     CYCLE
+                  END IF
+*
+*                 3) Swap the generated columns from the right-side
+*                    NB_GEN-size block in COPYA into the correct column
+*                    positions.
+*
+                  IF( IMAT.EQ.6
+     $                .OR. IMAT.EQ.7
+     $                .OR. IMAT.EQ.8
+     $                .OR. IMAT.EQ.10
+     $                .OR. IMAT.EQ.11 ) THEN
+*
+*                    Move by swapping the generated columns
+*                    from the right NB_GEN-size block from
+*                    (NB_ZERO+1:NB_ZERO+JB_ZERO)
+*                    into columns (1:JB_ZERO-1).
+*
+                     DO J = 1, JB_ZERO-1, 1
+                        CALL CSWAP( M,
+     $                       COPYA( ( NB_ZERO+J-1)*LDA+1), 1,
+     $                       COPYA( (J-1)*LDA + 1 ), 1 )
+                     END DO
+*
+                  ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN
+*
+*                    ( IMAT = 12, Odd-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the even zero columns in the
+*                    left NB_ZERO-size block.
+*
+*                    ( IMAT = 13, Even-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the odd zero columns in the
+*                    left NB_ZERO-size block.
+*
+                     DO J = 1, NB_GEN, 1
+                        IND_OUT = ( NB_ZERO+J-1 )*LDA + 1
+                        IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA
+     $                           + 1
+                        CALL CSWAP( M,
+     $                       COPYA( IND_OUT ), 1,
+     $                       COPYA( IND_IN), 1 )
+                     END DO
+*
+                  END IF
+*
+*                 4) Order the singular values generated by
+*                    CLATMS in decreasing order and add trailing zeros
+*                    that correspond to zero columns.
+*                    The total number of singular values is MINMN.
+*
+                  MINMNB_GEN = MIN( M, NB_GEN )
+*
+                  CALL SLAORD( 'Decreasing', MINMNB_GEN, S, 1 )
+
+                  DO I = MINMNB_GEN+1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE
+*
+*                 IF(MINMN.LT.2) skip this size for this matrix type.
+*
+                  CYCLE
+               END IF
+*
+*              Initialize a copy of the pivot array for CGEQP3RK.
+*
+               DO I = 1, N
+                  IWORK( I ) = 0
+               END DO
+*
+               DO INB = 1, NNB
+*
+*                 Do for each pair of values (NB,NX) in NBVAL and NXVAL.
+*
+                  NB = NBVAL( INB )
+                  CALL XLAENV( 1, NB )
+                  NX = NXVAL( INB )
+                  CALL XLAENV( 3, NX )
+*
+*                 We loop up to MIN(M,N)+1 in order to test KMAX greater
+*                 than MIN(M,N); in that case the routine should reduce
+*                 KMAX to MIN(M,N).
+*
+                  DO KMAX = 0, MIN(M,N)+1
+*
+*                    Get a working copy of COPYA into A( 1:M, 1:N ).
+*                    Get a working copy of COPYB into A( 1:M, N+1:N+NRHS ).
+*                    Get a working copy of COPYB into B( 1:M, 1:NRHS ).
+*                    Copy the zero-initialized IWORK( 1:N ) into
+*                    IWORK( N+1:2N ), which is used as the pivot array.
+*                    NOTE: IWORK( 2N+1:3N ) is used as a WORK array
+*                    for the routine.
+*
+                     CALL CLACPY( 'All', M, N, COPYA, LDA, A, LDA )
+                     CALL CLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                            A( LDA*N + 1 ), LDA )
+                     CALL CLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                            B, LDA )
+                     CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 )
+*
+                     ABSTOL = -1.0
+                     RELTOL = -1.0
+*
+*                    Compute the QR factorization with pivoting of A
+*
+                     LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ),
+     $                                 3*N + NRHS - 1 ) )
+*
+*                    Compute CGEQP3RK factorization of A.
+*
+                     SRNAMT = 'CGEQP3RK'
+                     CALL CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                              A, LDA, KFACT, MAXC2NRMK,
+     $                              RELMAXC2NRMK, IWORK( N+1 ), TAU,
+     $                              WORK, LW, RWORK, IWORK( 2*N+1 ),
+     $                              INFO )
+*
+*                    Check error code from CGEQP3RK.
+*
+                     IF( INFO.LT.0 )
+     $                  CALL ALAERH( PATH, 'CGEQP3RK', INFO, 0, ' ',
+     $                               M, N, NX, -1, NB, IMAT,
+     $                               NFAIL, NERRS, NOUT )
+*
+                     IF( KFACT.EQ.MINMN ) THEN
+*
+*                       Compute test 1:
+*
+*                       This test is only for the full-rank factorization of
+*                       the matrix A.
+*
+*                       Array S(1:min(M,N)) contains svd(A), the singular values
+*                       of the original matrix A in decreasing absolute value
+*                       order. The test computes svd(R), the vector of singular
+*                       values of the upper trapezoid of A(1:M,1:N) that
+*                       contains the factor R, in decreasing order. The test
+*                       returns the ratio:
+*
+*                       2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS )
+*
+                        RESULT( 1 ) = CQRT12( M, N, A, LDA, S, WORK,
+     $                                LWORK, RWORK )
+*
+                        DO T = 1, 1
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', M, N,
+     $                            NRHS, KMAX, ABSTOL, RELTOL, NB, NX,
+     $                            IMAT, T, RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                       End test 1
+*
+                     END IF
+
+*                    Compute test 2:
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS )
+*
+                     RESULT( 2 ) = CQPT01( M, N, KFACT, COPYA, A, LDA, TAU,
+     $                             IWORK( N+1 ), WORK, LWORK )
+*
+*                    Compute test 3:
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( Q**T * Q - I ) / ( M * EPS )
+*
+                     RESULT( 3 ) = CQRT11( M, KFACT, A, LDA, TAU, WORK,
+     $                             LWORK )
+*
+*                    Print information about the tests that did not pass
+*                    the threshold.
+*
+                     DO T = 2, 3
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', M, N,
+     $                         NRHS, KMAX, ABSTOL, RELTOL,
+     $                         NB, NX, IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 2
+*
+*                    Compute test 4:
+*
+*                    This test is only for factorizations with
+*                    rank at least 2.
+*                    The elements on the diagonal of R should be
+*                    non-increasing.
+*
+*                    The test returns the ratio:
+*
+*                    Returns BIGNUM if abs(R(K+1,K+1)) > abs(R(K,K)),
+*                    K=1:KFACT-1
+*
+                     IF( MIN(KFACT, MINMN).GE.2 ) THEN
+*
+                        RESULT( 4 ) = ZERO
+*
+                        DO J = 1, KFACT-1, 1
+*
+                           DTEMP = (( ABS( A( (J-1)*M+J ) ) -
+     $                          ABS( A( (J)*M+J+1 ) ) ) /
+     $                          ABS( A(1) ) )
+*
+                           IF( DTEMP.LT.ZERO ) THEN
+                              RESULT( 4 ) = BIGNUM
+                           END IF
+*
+                        END DO
+*
+*                       Print information about the tests that did not
+*                       pass the threshold.
+*
+                        DO T = 4, 4
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK',
+     $                            M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T,
+     $                            RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                       End test 4.
+*
+                     END IF
+*
+*                    Compute test 5:
+*
+*                    This test is only for matrix A with min(M,N) > 0.
+*
+*                    The test returns the ratio:
+*
+*                    1-norm(Q**T * B - Q**T * B ) /
+*                        ( M * EPS )
+*
+*                    (1) Compute B:=Q**T * B in the matrix B.
+*
+                     IF( MINMN.GT.0 ) THEN
+*
+                        LWORK_MQR = MAX(1, NRHS)
+                        CALL CUNMQR( 'Left', 'Conjugate transpose',
+     $                       M, NRHS, KFACT, A, LDA, TAU, B, LDA,
+     $                       WORK, LWORK_MQR, INFO )
+*
+                        DO I = 1, NRHS
+*
+*                          Compare the (N+I)-th column of A and the
+*                          I-th column of B.
+*
+                           CALL CAXPY( M, -CONE, A( ( N+I-1 )*LDA+1 ), 1,
+     $                                 B( ( I-1 )*LDA+1 ), 1 )
+                        END DO
+*
+                        RESULT( 5 ) =
+     $                    ABS(
+     $                    CLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) /
+     $                    ( REAL( M )*SLAMCH( 'Epsilon' ) )
+     $                    )
+*
+*                       Print information about the tests that did not pass
+*                       the threshold.
+*
+                        DO T = 5, 5
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', M, N,
+     $                            NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T, RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                       End compute test 5.
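+*
+*                       Recap of the five CGEQP3RK test ratios above:
+*                        1: 2-norm(svd(R) - svd(A)) /
+*                           ( max(M,N) * 2-norm(svd(A)) * EPS ),
+*                           full-rank factorizations only
+*                        2: 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS )
+*                        3: 1-norm( Q**T * Q - I ) / ( M * EPS )
+*                        4: BIGNUM, if abs(R(K+1,K+1)) > abs(R(K,K))
+*                           for some K = 1:KFACT-1
+*                        5: 1-norm( Q**T*B - Q**T*B ) / ( M * EPS ),
+*                           comparing the CUNMQR product with the copy
+*                           returned by CGEQP3RK
+*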
+* + END IF +* +* END DO KMAX = 1, MIN(M,N)+1 +* + END DO +* +* END DO for INB = 1, NNB +* + END DO +* +* END DO for IMAT = 1, NTYPES +* + END DO +* +* END DO for INS = 1, NNS +* + END DO +* +* END DO for IN = 1, NN +* + END DO +* +* END DO for IM = 1, NM +* + END DO +* +* Print a summary of the results. +* + CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) +* + 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5, + $ ', KMAX =', I5, ', ABSTOL =', G12.5, + $ ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4, + $ ', type ', I2, ', test ', I2, ', ratio =', G12.5 ) +* +* End of CCHKQP3RK +* + END diff --git a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f index 51cef512d..83e8a17b0 100644 --- a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f @@ -400,9 +400,9 @@ * Factor the matrix and solve the system using CHESV_AA. * SRNAMT = 'CHESV_AA_2STAGE ' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX) CALL CHESV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ X, LDA, WORK, LWORK, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/clatb4.f b/lapack-netlib/TESTING/LIN/clatb4.f index eeb0f03a9..233a8631a 100644 --- a/lapack-netlib/TESTING/LIN/clatb4.f +++ b/lapack-netlib/TESTING/LIN/clatb4.f @@ -154,9 +154,6 @@ * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. -* .. External Subroutines .. - EXTERNAL SLABAD -* .. * .. Save statement .. SAVE EPS, SMALL, LARGE, BADC1, BADC2, FIRST * .. @@ -174,11 +171,6 @@ BADC1 = SQRT( BADC2 ) SMALL = SLAMCH( 'Safe minimum' ) LARGE = ONE / SMALL -* -* If it looks like we're on a Cray, take the square root of -* SMALL and LARGE to avoid overflow and underflow problems. -* - CALL SLABAD( SMALL, LARGE ) SMALL = SHRINK*( SMALL / EPS ) LARGE = ONE / SMALL END IF @@ -233,6 +225,110 @@ ELSE ANORM = ONE END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* xQK: truncated QR with pivoting. +* Set parameters to generate a general +* M x N matrix. +* +* Set TYPE, the type of matrix to be generated. 'N' is nonsymmetric. +* + TYPE = 'N' +* +* Set DIST, the type of distribution for the random +* number generator. 'S' is +* + DIST = 'S' +* +* Set the lower and upper bandwidths. +* + IF( IMAT.EQ.2 ) THEN +* +* 2. Random, Diagonal, CNDNUM = 2 +* + KL = 0 + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.3 ) THEN +* +* 3. Random, Upper triangular, CNDNUM = 2 +* + KL = 0 + KU = MAX( N-1, 0 ) + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.4 ) THEN +* +* 4. Random, Lower triangular, CNDNUM = 2 +* + KL = MAX( M-1, 0 ) + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE +* +* 5.-19. Rectangular matrix +* + KL = MAX( M-1, 0 ) + KU = MAX( N-1, 0 ) +* + IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN +* +* 5.-14. Random, CNDNUM = 2. +* + CNDNUM = TWO + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.15 ) THEN +* +* 15. Random, CNDNUM = sqrt(0.1/EPS) +* + CNDNUM = BADC1 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.16 ) THEN +* +* 16. Random, CNDNUM = 0.1/EPS +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.17 ) THEN +* +* 17. Random, CNDNUM = 0.1/EPS, +* one small singular value S(N)=1/CNDNUM +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 2 +* + ELSE IF( IMAT.EQ.18 ) THEN +* +* 18. Random, scaled near underflow +* + CNDNUM = TWO + ANORM = SMALL + MODE = 3 +* + ELSE IF( IMAT.EQ.19 ) THEN +* +* 19. 
Random, scaled near overflow +* + CNDNUM = TWO + ANORM = LARGE + MODE = 3 +* + END IF +* + END IF * ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN * @@ -517,17 +613,18 @@ * * Set the norm and condition number. * - IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN + MAT = ABS( IMAT ) + IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN CNDNUM = BADC1 - ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN + ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN CNDNUM = BADC2 ELSE CNDNUM = TWO END IF * - IF( IMAT.EQ.4 ) THEN + IF( MAT.EQ.4 ) THEN ANORM = SMALL - ELSE IF( IMAT.EQ.5 ) THEN + ELSE IF( MAT.EQ.5 ) THEN ANORM = LARGE ELSE ANORM = ONE diff --git a/lapack-netlib/TESTING/LIN/cqpt01.f b/lapack-netlib/TESTING/LIN/cqpt01.f index 79fc2dc66..149c5bb7c 100644 --- a/lapack-netlib/TESTING/LIN/cqpt01.f +++ b/lapack-netlib/TESTING/LIN/cqpt01.f @@ -33,7 +33,8 @@ *> Householder vectors, and the rest of AF contains a partially updated *> matrix. *> -*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M) +*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) ) +*> where || . || is matrix one norm. *> \endverbatim * * Arguments: @@ -172,28 +173,28 @@ * NORMA = CLANGE( 'One-norm', M, N, A, LDA, RWORK ) * - DO 30 J = 1, K - DO 10 I = 1, MIN( J, M ) + DO J = 1, K + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = AF( I, J ) - 10 CONTINUE - DO 20 I = J + 1, M + END DO + DO I = J + 1, M WORK( ( J-1 )*M+I ) = ZERO - 20 CONTINUE - 30 CONTINUE - DO 40 J = K + 1, N + END DO + END DO + DO J = K + 1, N CALL CCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 ) - 40 CONTINUE + END DO * CALL CUNMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK, $ M, WORK( M*N+1 ), LWORK-M*N, INFO ) * - DO 50 J = 1, N + DO J = 1, N * * Compare i-th column of QR and jpvt(i)-th column of A * CALL CAXPY( M, CMPLX( -ONE ), A( 1, JPVT( J ) ), 1, $ WORK( ( J-1 )*M+1 ), 1 ) - 50 CONTINUE + END DO * CQPT01 = CLANGE( 'One-norm', M, N, WORK, M, RWORK ) / $ ( REAL( MAX( M, N ) )*SLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/cqrt11.f b/lapack-netlib/TESTING/LIN/cqrt11.f index 494d5e9cd..a52084973 100644 --- a/lapack-netlib/TESTING/LIN/cqrt11.f +++ b/lapack-netlib/TESTING/LIN/cqrt11.f @@ -157,9 +157,9 @@ CALL CUNM2R( 'Left', 'Conjugate transpose', M, M, K, A, LDA, TAU, $ WORK, M, WORK( M*M+1 ), INFO ) * - DO 10 J = 1, M + DO J = 1, M WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE - 10 CONTINUE + END DO * CQRT11 = CLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) / $ ( REAL( M )*SLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/cqrt12.f b/lapack-netlib/TESTING/LIN/cqrt12.f index 4c29423ae..0df2d833b 100644 --- a/lapack-netlib/TESTING/LIN/cqrt12.f +++ b/lapack-netlib/TESTING/LIN/cqrt12.f @@ -28,7 +28,7 @@ *> CQRT12 computes the singular values `svlues' of the upper trapezoid *> of A(1:M,1:N) and returns the ratio *> -*> || s - svlues||/(||svlues||*eps*max(M,N)) +*> || svlues -s ||/( ||s||*eps*max(M,N) ) *> \endverbatim * * Arguments: @@ -125,8 +125,8 @@ EXTERNAL CLANGE, SASUM, SLAMCH, SNRM2 * .. * .. External Subroutines .. - EXTERNAL CGEBD2, CLASCL, CLASET, SAXPY, SBDSQR, SLABAD, - $ SLASCL, XERBLA + EXTERNAL CGEBD2, CLASCL, CLASET, SAXPY, SBDSQR, SLASCL, + $ XERBLA * .. * .. Intrinsic Functions .. 
INTRINSIC CMPLX, MAX, MIN, REAL @@ -153,17 +153,16 @@ * Copy upper triangle of A into work * CALL CLASET( 'Full', M, N, CMPLX( ZERO ), CMPLX( ZERO ), WORK, M ) - DO 20 J = 1, N - DO 10 I = 1, MIN( J, M ) + DO J = 1, N + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = A( I, J ) - 10 CONTINUE - 20 CONTINUE + END DO + END DO * * Get machine parameters * SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale work if max entry outside range [SMLNUM,BIGNUM] * @@ -207,9 +206,9 @@ * ELSE * - DO 30 I = 1, MN + DO I = 1, MN RWORK( I ) = ZERO - 30 CONTINUE + END DO END IF * * Compare s and singular values of work diff --git a/lapack-netlib/TESTING/LIN/dchkaa.F b/lapack-netlib/TESTING/LIN/dchkaa.F index ef9d7808c..6582cac13 100644 --- a/lapack-netlib/TESTING/LIN/dchkaa.F +++ b/lapack-netlib/TESTING/LIN/dchkaa.F @@ -63,6 +63,7 @@ *> DLQ 8 List types on next line if 0 < NTYPES < 8 *> DQL 8 List types on next line if 0 < NTYPES < 8 *> DQP 6 List types on next line if 0 < NTYPES < 6 +*> DQK 19 List types on next line if 0 < NTYPES < 19 *> DTZ 3 List types on next line if 0 < NTYPES < 3 *> DLS 6 List types on next line if 0 < NTYPES < 6 *> DEQ @@ -149,12 +150,12 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - DOUBLE PRECISION E( NMAX ), S( 2*NMAX ) * .. * .. Allocatable Arrays .. INTEGER AllocateStatus - DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK - DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK, S + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: E + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK * .. * .. External Functions .. LOGICAL LSAME, LSAMEN @@ -164,13 +165,13 @@ * .. External Subroutines .. EXTERNAL ALAREQ, DCHKEQ, DCHKGB, DCHKGE, DCHKGT, DCHKLQ, $ DCHKORHR_COL, DCHKPB, DCHKPO, DCHKPS, DCHKPP, - $ DCHKPT, DCHKQ3, DCHKQL, DCHKQR, DCHKRQ, DCHKSP, - $ DCHKSY, DCHKSY_ROOK, DCHKSY_RK, DCHKSY_AA, - $ DCHKTB, DCHKTP, DCHKTR, DCHKTZ, DDRVGB, DDRVGE, - $ DDRVGT, DDRVLS, DDRVPB, DDRVPO, DDRVPP, DDRVPT, - $ DDRVSP, DDRVSY, DDRVSY_ROOK, DDRVSY_RK, - $ DDRVSY_AA, ILAVER, DCHKLQTP, DCHKQRT, DCHKQRTP, - $ DCHKLQT,DCHKTSQR + $ DCHKPT, DCHKQ3, DCHKQP3RK, DCHKQL, DCHKQR, + $ DCHKRQ, DCHKSP, DCHKSY, DCHKSY_ROOK, DCHKSY_RK, + $ DCHKSY_AA, DCHKTB, DCHKTP, DCHKTR, DCHKTZ, + $ DDRVGB, DDRVGE, DDRVGT, DDRVLS, DDRVPB, DDRVPO, + $ DDRVPP, DDRVPT, DDRVSP, DDRVSY, DDRVSY_ROOK, + $ DDRVSY_RK, DDRVSY_AA, ILAVER, DCHKLQTP, DCHKQRT, + $ DCHKQRTP, DCHKLQT,DCHKTSQR * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -197,6 +198,10 @@ IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" ALLOCATE ( WORK( NMAX, 3*NMAX+MAXRHS+30 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( E( NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" ALLOCATE ( RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * @@ -919,9 +924,26 @@ CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) * IF( TSTCHK ) THEN - CALL DCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ THRESH, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 3 ), WORK, IWORK, NOUT ) + CALL DCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, + $ NXVAL, THRESH, A( 1, 1 ), A( 1, 2 ), + $ B( 1, 1 ), B( 1, 3 ), WORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* QK: truncated QR factorization with pivoting +* + NTYPES = 19 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ), + $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), B( 1, 4 ), + $ WORK, IWORK, NOUT ) ELSE WRITE( NOUT, FMT = 9989 )PATH END IF @@ -1054,6 +1076,8 @@ * DEALLOCATE (A, STAT = AllocateStatus) DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (E, STAT = AllocateStatus) + DEALLOCATE (S, STAT = AllocateStatus) DEALLOCATE (WORK, STAT = AllocateStatus) DEALLOCATE (RWORK, STAT = AllocateStatus) * diff --git a/lapack-netlib/TESTING/LIN/dchkq3.f b/lapack-netlib/TESTING/LIN/dchkq3.f index 1fdf07252..494008fa8 100644 --- a/lapack-netlib/TESTING/LIN/dchkq3.f +++ b/lapack-netlib/TESTING/LIN/dchkq3.f @@ -30,7 +30,7 @@ *> *> \verbatim *> -*> DCHKQ3 tests DGEQP3. +*> DCHKQ3 tests DGEQP3. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/TESTING/LIN/dchkqp3rk.f b/lapack-netlib/TESTING/LIN/dchkqp3rk.f new file mode 100644 index 000000000..434d2067e --- /dev/null +++ b/lapack-netlib/TESTING/LIN/dchkqp3rk.f @@ -0,0 +1,832 @@ +*> \brief \b DCHKQP3RK +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, +* $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, +* $ B, COPYB, S, TAU, +* $ WORK, IWORK, NOUT ) +* IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. +* INTEGER NM, NN, NNS, NNB, NOUT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* LOGICAL DOTYPE( * ) +* INTEGER IWORK( * ), MVAL( * ), NBVAL( * ), NSVAL( * ), +* $ NVAL( * ), NXVAL( * ) +* DOUBLE PRECISION A( * ), COPYA( * ), B( * ), COPYB( * ), +* $ S( * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DCHKQP3RK tests DGEQP3RK. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] DOTYPE +*> \verbatim +*> DOTYPE is LOGICAL array, dimension (NTYPES) +*> The matrix types to be used for testing. Matrices of type j +*> (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) = +*> .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used. 
+*> \endverbatim +*> +*> \param[in] NM +*> \verbatim +*> NM is INTEGER +*> The number of values of M contained in the vector MVAL. +*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. +*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNS +*> \verbatim +*> NNS is INTEGER +*> The number of values of NRHS contained in the vector NSVAL. +*> \endverbatim +*> +*> \param[in] NSVAL +*> \verbatim +*> NSVAL is INTEGER array, dimension (NNS) +*> The values of the number of right hand sides NRHS. +*> \endverbatim +*> +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB and NX contained in the +*> vectors NBVAL and NXVAL. The blocking parameters are used +*> in pairs (NB,NX). +*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NNB) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NXVAL +*> \verbatim +*> NXVAL is INTEGER array, dimension (NNB) +*> The values of the crossover point NX. +*> \endverbatim +*> +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. +*> \endverbatim +*> +*> \param[out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (MMAX*NMAX) +*> where MMAX is the maximum value of M in MVAL and NMAX is the +*> maximum value of N in NVAL. +*> \endverbatim +*> +*> \param[out] COPYA +*> \verbatim +*> COPYA is DOUBLE PRECISION array, dimension (MMAX*NMAX) +*> \endverbatim +*> +*> \param[out] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (MMAX*NSMAX) +*> where MMAX is the maximum value of M in MVAL and NSMAX is the +*> maximum value of NRHS in NSVAL. +*> \endverbatim +*> +*> \param[out] COPYB +*> \verbatim +*> COPYB is DOUBLE PRECISION array, dimension (MMAX*NSMAX) +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is DOUBLE PRECISION array, dimension +*> (min(MMAX,NMAX)) +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is DOUBLE PRECISION array, dimension (MMAX) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension +*> (MMAX*NMAX + 4*NMAX + MMAX) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (2*NMAX) +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup double_lin +* +* ===================================================================== + SUBROUTINE DCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, + $ B, COPYB, S, TAU, + $ WORK, IWORK, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER NM, NN, NNB, NNS, NOUT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. 
+ LOGICAL DOTYPE( * ) + INTEGER IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ), + $ NSVAL( * ), NXVAL( * ) + DOUBLE PRECISION A( * ), COPYA( * ), B( * ), COPYB( * ), + $ S( * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NTYPES + PARAMETER ( NTYPES = 19 ) + INTEGER NTESTS + PARAMETER ( NTESTS = 5 ) + DOUBLE PRECISION ONE, ZERO, BIGNUM + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0, + $ BIGNUM = 1.0D+38 ) +* .. +* .. Local Scalars .. + CHARACTER DIST, TYPE + CHARACTER*3 PATH + INTEGER I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO, + $ INB, IND_OFFSET_GEN, + $ IND_IN, IND_OUT, INS, INFO, + $ ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO, + $ KFACT, KL, KMAX, KU, LDA, LW, LWORK, + $ LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N, + $ NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS, + $ NRUN, NX, T + DOUBLE PRECISION ANORM, CNDNUM, EPS, ABSTOL, RELTOL, + $ DTEMP, MAXC2NRMK, RELMAXC2NRMK +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ), ISEEDY( 4 ) + DOUBLE PRECISION RESULT( NTESTS ), RDUMMY( 1 ) +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, DQPT01, DQRT11, DQRT12, DLANGE, + $ DLAPY2 + EXTERNAL DLAMCH, DQPT01, DQRT11, DQRT12, DLANGE +* .. +* .. External Subroutines .. + EXTERNAL ALAERH, ALAHD, ALASUM, DAXPY, DGEQP3RK, + $ DLACPY, DLAORD, DLASET, DLATB4, DLATMS, + $ DORMQR, DSWAP, ICOPY, XLAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX, MIN, MOD +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, IOUNIT +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, IOUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEEDY / 1988, 1989, 1990, 1991 / +* .. +* .. Executable Statements .. +* +* Initialize constants and the random number seed. +* + PATH( 1: 1 ) = 'Double precision' + PATH( 2: 3 ) = 'QK' + NRUN = 0 + NFAIL = 0 + NERRS = 0 + DO I = 1, 4 + ISEED( I ) = ISEEDY( I ) + END DO + EPS = DLAMCH( 'Epsilon' ) + INFOT = 0 +* + DO IM = 1, NM +* +* Do for each value of M in MVAL. +* + M = MVAL( IM ) + LDA = MAX( 1, M ) +* + DO IN = 1, NN +* +* Do for each value of N in NVAL. +* + N = NVAL( IN ) + MINMN = MIN( M, N ) + LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ), + $ M*N + 2*MINMN + 4*N ) +* + DO INS = 1, NNS + NRHS = NSVAL( INS ) +* +* Set up parameters with DLATB4 and generate +* M-by-NRHS B matrix with DLATMS. +* IMAT = 14: +* Random matrix, CNDNUM = 2, NORM = ONE, +* MODE = 3 (geometric distribution of singular values). +* + CALL DLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'DLATMS' + CALL DLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYB, LDA, WORK, INFO ) + + +* +* Check error code from DLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'DLATMS', INFO, 0, ' ', M, + $ NRHS, -1, -1, -1, 6, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + DO IMAT = 1, NTYPES +* +* Do the tests only if DOTYPE( IMAT ) is true. +* + IF( .NOT.DOTYPE( IMAT ) ) + $ CYCLE +* +* The type of distribution used to generate the random +* eigen-/singular values: +* ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 ) +* +* Do for each type of NON-SYMMETRIC matrix: CNDNUM NORM MODE +* 1. Zero matrix +* 2. Random, Diagonal, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 3. Random, Upper triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 4. 
Random, Lower triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 5. Random, First column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 6. Random, Last MINMN column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 7. Random, Last N column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 8. Random, Middle column in MINMN is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 9. Random, First half of MINMN columns are zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 11. Random, Half MINMN columns in the middle are zero starting +* from MINMN/2-(MINMN/2)/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 12. Random, Odd columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 13. Random, Even columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 14. Random, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 15. Random, CNDNUM = sqrt(0.1/EPS) CNDNUM = BADC1 = sqrt(0.1/EPS) ONE 3 ( geometric distribution of singular values ) +* 16. Random, CNDNUM = 0.1/EPS CNDNUM = BADC2 = 0.1/EPS ONE 3 ( geometric distribution of singular values ) +* 17. Random, CNDNUM = 0.1/EPS, CNDNUM = BADC2 = 0.1/EPS ONE 2 ( one small singular value, S(N)=1/CNDNUM ) +* one small singular value S(N)=1/CNDNUM +* 18. Random, CNDNUM = 2, scaled near underflow CNDNUM = 2 SMALL = SAFMIN +* 19. Random, CNDNUM = 2, scaled near overflow CNDNUM = 2 LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) ) 3 ( geometric distribution of singular values ) +* + IF( IMAT.EQ.1 ) THEN +* +* Matrix 1: Zero matrix +* + CALL DLASET( 'Full', M, N, ZERO, ZERO, COPYA, LDA ) + DO I = 1, MINMN + S( I ) = ZERO + END DO +* + ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 ) + $ .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN +* +* Matrices 2-5. +* +* Set up parameters with DLATB4 and generate a test +* matrix with DLATMS. +* + CALL DLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'DLATMS' + CALL DLATMS( M, N, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYA, LDA, WORK, INFO ) +* +* Check error code from DLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'DLATMS', INFO, 0, ' ', M, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + CALL DLAORD( 'Decreasing', MINMN, S, 1 ) +* + ELSE IF( MINMN.GE.2 + $ .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN +* +* Rectangular matrices 5-13 that contain zero columns, +* only for matrices MINMN >=2. +* +* JB_ZERO is the column index of ZERO block. +* NB_ZERO is the column block size of ZERO block. +* NB_GEN is the column blcok size of the +* generated block. +* J_INC in the non_zero column index increment +* for matrix 12 and 13. +* J_FIRS_NZ is the index of the first non-zero +* column. +* + IF( IMAT.EQ.5 ) THEN +* +* First column is zero. +* + JB_ZERO = 1 + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.6 ) THEN +* +* Last column MINMN is zero. +* + JB_ZERO = MINMN + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.7 ) THEN +* +* Last column N is zero. 
+*
+                     JB_ZERO = N
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.8 ) THEN
+*
+*                    Middle column in MINMN is zero.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.9 ) THEN
+*
+*                    First half of MINMN columns are zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.10 ) THEN
+*
+*                    Last columns are zero columns,
+*                    starting from (MINMN / 2 + 1) column.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = N - JB_ZERO + 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.11 ) THEN
+*
+*                    Half of the columns in the middle of MINMN
+*                    columns are zero, starting from
+*                    MINMN/2 - (MINMN/2)/2 + 1 column.
+*
+                     JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.12 ) THEN
+*
+*                    Odd-numbered columns are zero.
+*
+                     NB_GEN = N / 2
+                     NB_ZERO = N - NB_GEN
+                     J_INC = 2
+                     J_FIRST_NZ = 2
+*
+                  ELSE IF( IMAT.EQ.13 ) THEN
+*
+*                    Even-numbered columns are zero.
+*
+                     NB_ZERO = N / 2
+                     NB_GEN = N - NB_ZERO
+                     J_INC = 2
+                     J_FIRST_NZ = 1
+*
+                  END IF
+*
+*
+*                 1) Set the first NB_ZERO columns in COPYA(1:M,1:N)
+*                    to zero.
+*
+                  CALL DLASET( 'Full', M, NB_ZERO, ZERO, ZERO,
+     $                         COPYA, LDA )
+*
+*                 2) Generate an M-by-(N-NB_ZERO) matrix with the
+*                    chosen singular value distribution
+*                    in COPYA(1:M,NB_ZERO+1:N).
+*
+                  CALL DLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU,
+     $                         ANORM, MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'DLATMS'
+*
+                  IND_OFFSET_GEN = NB_ZERO * LDA
+*
+                  CALL DLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA( IND_OFFSET_GEN + 1 ), LDA,
+     $                         WORK, INFO )
+*
+*                 Check error code from DLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'DLATMS', INFO, 0, ' ', M,
+     $                            NB_GEN, -1, -1, -1, IMAT, NFAIL,
+     $                            NERRS, NOUT )
+                     CYCLE
+                  END IF
+*
+*                 3) Swap the generated columns from the right-side
+*                    NB_GEN-size block in COPYA into the correct
+*                    column positions.
+*
+                  IF( IMAT.EQ.6
+     $                .OR. IMAT.EQ.7
+     $                .OR. IMAT.EQ.8
+     $                .OR. IMAT.EQ.10
+     $                .OR. IMAT.EQ.11 ) THEN
+*
+*                    Move by swapping the generated columns
+*                    (NB_ZERO+1:NB_ZERO+JB_ZERO-1) of the right
+*                    NB_GEN-size block into columns (1:JB_ZERO-1).
+*
+                     DO J = 1, JB_ZERO-1, 1
+                        CALL DSWAP( M,
+     $                       COPYA( ( NB_ZERO+J-1)*LDA+1), 1,
+     $                       COPYA( (J-1)*LDA + 1 ), 1 )
+                     END DO
+*
+                  ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN
+*
+*                    ( IMAT = 12, Odd-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the even zero columns in
+*                    the left NB_ZERO-size block.
+*
+*                    ( IMAT = 13, Even-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the odd zero columns in
+*                    the left NB_ZERO-size block.
+*
+                     DO J = 1, NB_GEN, 1
+                        IND_OUT = ( NB_ZERO+J-1 )*LDA + 1
+                        IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA
+     $                           + 1
+                        CALL DSWAP( M,
+     $                       COPYA( IND_OUT ), 1,
+     $                       COPYA( IND_IN), 1 )
+                     END DO
+*
+                  END IF
+*
+*                 4) Order the singular values generated by
+*                    DLATMS in decreasing order and add trailing zeros
+*                    that correspond to zero columns.
+*                    The total number of singular values is MINMN.
+*
+                  MINMNB_GEN = MIN( M, NB_GEN )
+*
+                  DO I = MINMNB_GEN+1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE
+*
+*                 IF(MINMN.LT.2) skip this size for this matrix type.
+*
+                  CYCLE
+               END IF
+*
+*              Initialize a copy array for a pivot array for DGEQP3RK.
+*
+               DO I = 1, N
+                  IWORK( I ) = 0
+               END DO
+*
+               DO INB = 1, NNB
+*
+*                 Do for each pair of values (NB,NX) in NBVAL and NXVAL.
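+*                 XLAENV( 1, NB ) and XLAENV( 3, NX ) override the
+*                 block size and crossover point reported by ILAENV,
+*                 so each (NB,NX) pair exercises a different blocked
+*                 code path.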
+*
+                  NB = NBVAL( INB )
+                  CALL XLAENV( 1, NB )
+                  NX = NXVAL( INB )
+                  CALL XLAENV( 3, NX )
+*
+*                 We use MIN(M,N)+1 as the loop bound because we need
+*                 a test with KMAX larger than MIN(M,N); in that case
+*                 the factorization should proceed as if
+*                 KMAX = MIN(M,N).
+*
+                  DO KMAX = 0, MIN(M,N)+1
+*
+*                    Get a working copy of COPYA into A( 1:M,1:N ).
+*                    Get a working copy of COPYB into
+*                    A( 1:M, (N+1):(N+NRHS) ).
+*                    Get a working copy of COPYB into B( 1:M, 1:NRHS ).
+*                    Get a working copy of the zeroed IWORK(1:N) into
+*                    IWORK( N+1:2N ), which is going to be used as
+*                    the pivot array.
+*                    NOTE: IWORK(2N+1:3N) is going to be used as a WORK
+*                    array for the routine.
+*
+                     CALL DLACPY( 'All', M, N, COPYA, LDA, A, LDA )
+                     CALL DLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                            A( LDA*N + 1 ), LDA )
+                     CALL DLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                            B, LDA )
+                     CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 )
+*
+                     ABSTOL = -1.0
+                     RELTOL = -1.0
+*
+*                    Compute the QR factorization with pivoting of A
+*
+                     LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ),
+     $                                 3*N + NRHS - 1 ) )
+*
+*                    Compute DGEQP3RK factorization of A.
+*
+                     SRNAMT = 'DGEQP3RK'
+                     CALL DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                              A, LDA, KFACT, MAXC2NRMK,
+     $                              RELMAXC2NRMK, IWORK( N+1 ), TAU,
+     $                              WORK, LW, IWORK( 2*N+1 ), INFO )
+*
+*                    Check error code from DGEQP3RK.
+*
+                     IF( INFO.LT.0 )
+     $                  CALL ALAERH( PATH, 'DGEQP3RK', INFO, 0, ' ',
+     $                               M, N, NX, -1, NB, IMAT,
+     $                               NFAIL, NERRS, NOUT )
+*
+*                    Compute test 1:
+*
+*                    This test is only for the full-rank factorization
+*                    of the matrix A.
+*
+*                    Array S(1:min(M,N)) contains svd(A), the singular
+*                    values of the original matrix A in decreasing
+*                    absolute value order. The test computes svd(R),
+*                    the vector of singular values of the upper
+*                    trapezoid of A(1:M,1:N) that contains the factor
+*                    R, in decreasing order. The test returns
+*                    the ratio:
+*
+*                    2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS )
+*
+                     IF( KFACT.EQ.MINMN ) THEN
+*
+                        RESULT( 1 ) = DQRT12( M, N, A, LDA, S, WORK,
+     $                                LWORK )
+*
+                        DO T = 1, 1
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK',
+     $                            M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T, RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                    End test 1
+*
+                     END IF
+*
+*                    Compute test 2:
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS )
+*
+                     RESULT( 2 ) = DQPT01( M, N, KFACT, COPYA, A, LDA,
+     $                             TAU, IWORK( N+1 ), WORK, LWORK )
+*
+*                    Compute test 3:
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( Q**T * Q - I ) / ( M * EPS )
+*
+                     RESULT( 3 ) = DQRT11( M, KFACT, A, LDA, TAU, WORK,
+     $                             LWORK )
+*
+*                    Print information about the tests that did not
+*                    pass the threshold.
+*
+                     DO T = 2, 3
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK', M, N,
+     $                         NRHS, KMAX, ABSTOL, RELTOL,
+     $                         NB, NX, IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 2
+*
+*                    Compute test 4:
+*
+*                    This test is only for factorizations with rank
+*                    at least 2.
+*                    The elements on the diagonal of R should be
+*                    non-increasing.
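+*                    (Illustration: a diagonal of ( 3.0, 2.0, 0.5 )
+*                    passes, while ( 3.0, 0.5, 2.0 ) must fail, since
+*                    abs(R(3,3)) > abs(R(2,2)).)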
+*
+*                    The test returns the ratio:
+*
+*                    Returns BIGNUM if abs(R(K+1,K+1)) > abs(R(K,K)),
+*                    K=1:KFACT-1
+*
+                     IF( MIN(KFACT, MINMN).GE.2 ) THEN
+*
+*                       Initialize the ratio before scanning
+*                       the diagonal.
+*
+                        RESULT( 4 ) = ZERO
+*
+                        DO J = 1, KFACT-1, 1
+
+                           DTEMP = (( ABS( A( (J-1)*M+J ) ) -
+     $                        ABS( A( (J)*M+J+1 ) ) ) /
+     $                        ABS( A(1) ) )
+*
+                           IF( DTEMP.LT.ZERO ) THEN
+                              RESULT( 4 ) = BIGNUM
+                           END IF
+*
+                        END DO
+*
+*                       Print information about the tests that did not
+*                       pass the threshold.
+*
+                        DO T = 4, 4
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK',
+     $                            M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T,
+     $                            RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                    End test 4.
+*
+                     END IF
+*
+*                    Compute test 5:
+*
+*                    This test is only for matrix A with min(M,N) > 0.
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( (Q**T * B returned by DGEQP3RK) -
+*                            (Q**T * B computed by DORMQR) ) /
+*                    ( M * EPS )
+*
+*                    (1) Compute B:=Q**T * B in the matrix B.
+*
+                     IF( MINMN.GT.0 ) THEN
+*
+                        LWORK_MQR = MAX(1, NRHS)
+                        CALL DORMQR( 'Left', 'Transpose',
+     $                       M, NRHS, KFACT, A, LDA, TAU, B, LDA,
+     $                       WORK, LWORK_MQR, INFO )
+*
+                        DO I = 1, NRHS
+*
+*                          Subtract the (N+I)-th column of A from the
+*                          I-th column of B.
+*
+                           CALL DAXPY( M, -ONE, A( ( N+I-1 )*LDA+1 ),
+     $                          1, B( ( I-1 )*LDA+1 ), 1 )
+                        END DO
+*
+                        RESULT( 5 ) =
+     $                    ABS(
+     $                    DLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY )
+     $                    / ( DBLE( M )*DLAMCH( 'Epsilon' ) )
+     $                    )
+*
+*                       Print information about the tests that did not
+*                       pass the threshold.
+*
+                        DO T = 5, 5
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK',
+     $                            M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T, RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                    End compute test 5.
+*
+                     END IF
+*
+*                 END DO KMAX = 0, MIN(M,N)+1
+*
+                  END DO
+*
+*              END DO for INB = 1, NNB
+*
+               END DO
+*
+*           END DO for IMAT = 1, NTYPES
+*
+            END DO
+*
+*        END DO for INS = 1, NNS
+*
+         END DO
+*
+*     END DO for IN = 1, NN
+*
+      END DO
+*
+*     END DO for IM = 1, NM
+*
+      END DO
+*
+*     Print a summary of the results.
+*
+      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
+*
+ 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5,
+     $      ', KMAX =', I5, ', ABSTOL =', G12.5,
+     $      ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4,
+     $      ', type ', I2, ', test ', I2, ', ratio =', G12.5 )
+*
+*     End of DCHKQP3RK
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f
index bc4e77a5a..1940351a4 100644
--- a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f
@@ -421,9 +421,9 @@
 *              block factorization, LWORK is the length of AINV.
 *
                SRNAMT = 'DSYTRF_AA_2STAGE'
-               LWORK = MIN(N*NB, 3*NMAX*NMAX)
+               LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
                CALL DSYTRF_AA_2STAGE( UPLO, N, AFAC, LDA,
-     $                                AINV, (3*NB+1)*N,
+     $                                AINV, MAX( 1, (3*NB+1)*N ),
      $                                IWORK, IWORK( 1+N ),
      $                                WORK, LWORK,
      $                                INFO )
@@ -503,7 +503,6 @@ c           NT = 1
                CALL DLACPY( 'Full', N, NRHS, B, LDA, X, LDA )
 *
                SRNAMT = 'DSYTRS_AA_2STAGE'
-               LWORK = MAX( 1, 3*N-2 )
                CALL DSYTRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
      $              AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ),
      $              X, LDA, INFO )
diff --git a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f
index 91c9e8e9a..d04106ae3 100644
--- a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f
@@ -400,9 +400,9 @@
 *           Factor the matrix and solve the system using DSYSV_AA.
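+*           (The MAX( 1, ... ) guards below keep the workspace sizes
+*           positive when N*NB = 0; LAPACK requires LWORK >= 1.)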
 *
             SRNAMT = 'DSYSV_AA_2STAGE '
-            LWORK = MIN(N*NB, 3*NMAX*NMAX)
+            LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
             CALL DSYSV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
-     $                            AINV, (3*NB+1)*N,
+     $                            AINV, MAX( 1, (3*NB+1)*N ),
      $                            IWORK, IWORK( 1+N ),
      $                            X, LDA, WORK, LWORK, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/dlatb4.f b/lapack-netlib/TESTING/LIN/dlatb4.f
index 8825d13e7..f3bccd45b 100644
--- a/lapack-netlib/TESTING/LIN/dlatb4.f
+++ b/lapack-netlib/TESTING/LIN/dlatb4.f
@@ -133,7 +133,7 @@
 *
 *     .. Parameters ..
       DOUBLE PRECISION   SHRINK, TENTH
-      PARAMETER          ( SHRINK = 0.25D0, TENTH = 0.1D+0 )
+      PARAMETER          ( SHRINK = 0.25D+0, TENTH = 0.1D+0 )
       DOUBLE PRECISION   ONE
       PARAMETER          ( ONE = 1.0D+0 )
       DOUBLE PRECISION   TWO
@@ -153,9 +153,6 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           DLABAD
-*     ..
 *     .. Save statement ..
       SAVE               EPS, SMALL, LARGE, BADC1, BADC2, FIRST
 *     ..
@@ -173,11 +170,6 @@
          BADC1 = SQRT( BADC2 )
          SMALL = DLAMCH( 'Safe minimum' )
          LARGE = ONE / SMALL
-*
-*        If it looks like we're on a Cray, take the square root of
-*        SMALL and LARGE to avoid overflow and underflow problems.
-*
-         CALL DLABAD( SMALL, LARGE )
          SMALL = SHRINK*( SMALL / EPS )
          LARGE = ONE / SMALL
       END IF
@@ -232,6 +224,110 @@
       ELSE
          ANORM = ONE
       END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        xQK: truncated QR with pivoting.
+*             Set parameters to generate a general
+*             M x N matrix.
+*
+*        Set TYPE, the type of matrix to be generated. 'N' is nonsymmetric.
+*
+         TYPE = 'N'
+*
+*        Set DIST, the type of distribution for the random
+*        number generator. 'S' is symmetric (uniform on (-1,1)).
+*
+         DIST = 'S'
+*
+*        Set the lower and upper bandwidths.
+*
+         IF( IMAT.EQ.2 ) THEN
+*
+*           2. Random, Diagonal, CNDNUM = 2
+*
+            KL = 0
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.3 ) THEN
+*
+*           3. Random, Upper triangular, CNDNUM = 2
+*
+            KL = 0
+            KU = MAX( N-1, 0 )
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.4 ) THEN
+*
+*           4. Random, Lower triangular, CNDNUM = 2
+*
+            KL = MAX( M-1, 0 )
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE
+*
+*           5.-19. Rectangular matrix
+*
+            KL = MAX( M-1, 0 )
+            KU = MAX( N-1, 0 )
+*
+            IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN
+*
+*              5.-14. Random, CNDNUM = 2.
+*
+               CNDNUM = TWO
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.15 ) THEN
+*
+*              15. Random, CNDNUM = sqrt(0.1/EPS)
+*
+               CNDNUM = BADC1
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.16 ) THEN
+*
+*              16. Random, CNDNUM = 0.1/EPS
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.17 ) THEN
+*
+*              17. Random, CNDNUM = 0.1/EPS,
+*                  one small singular value S(N)=1/CNDNUM
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 2
+*
+            ELSE IF( IMAT.EQ.18 ) THEN
+*
+*              18. Random, scaled near underflow
+*
+               CNDNUM = TWO
+               ANORM = SMALL
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.19 ) THEN
+*
+*              19. Random, scaled near overflow
+*
+               CNDNUM = TWO
+               ANORM = LARGE
+               MODE = 3
+*
+            END IF
+*
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN
 *
@@ -518,17 +614,18 @@
 *
 *        Set the norm and condition number.
 *
-         IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN
+         MAT = ABS( IMAT )
+         IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN
             CNDNUM = BADC1
-         ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN
+         ELSE IF( MAT.EQ.3 .OR.
MAT.EQ.9 ) THEN CNDNUM = BADC2 ELSE CNDNUM = TWO END IF * - IF( IMAT.EQ.4 ) THEN + IF( MAT.EQ.4 ) THEN ANORM = SMALL - ELSE IF( IMAT.EQ.5 ) THEN + ELSE IF( MAT.EQ.5 ) THEN ANORM = LARGE ELSE ANORM = ONE diff --git a/lapack-netlib/TESTING/LIN/dqpt01.f b/lapack-netlib/TESTING/LIN/dqpt01.f index 8efbdc774..af3f5dd36 100644 --- a/lapack-netlib/TESTING/LIN/dqpt01.f +++ b/lapack-netlib/TESTING/LIN/dqpt01.f @@ -28,12 +28,13 @@ *> *> DQPT01 tests the QR-factorization with pivoting of a matrix A. The *> array AF contains the (possibly partial) QR-factorization of A, where -*> the upper triangle of AF(1:k,1:k) is a partial triangular factor, -*> the entries below the diagonal in the first k columns are the +*> the upper triangle of AF(1:K,1:K) is a partial triangular factor, +*> the entries below the diagonal in the first K columns are the *> Householder vectors, and the rest of AF contains a partially updated *> matrix. *> -*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M) +*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) ), +*> where || . || is matrix one norm. *> \endverbatim * * Arguments: @@ -172,28 +173,41 @@ * NORMA = DLANGE( 'One-norm', M, N, A, LDA, RWORK ) * - DO 30 J = 1, K - DO 10 I = 1, MIN( J, M ) + DO J = 1, K +* +* Copy the upper triangular part of the factor R stored +* in AF(1:K,1:K) into the work array WORK. +* + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = AF( I, J ) - 10 CONTINUE - DO 20 I = J + 1, M + END DO +* +* Zero out the elements below the diagonal in the work array. +* + DO I = J + 1, M WORK( ( J-1 )*M+I ) = ZERO - 20 CONTINUE - 30 CONTINUE - DO 40 J = K + 1, N + END DO + END DO +* +* Copy columns (K+1,N) from AF into the work array WORK. +* AF(1:K,K+1:N) contains the rectangular block of the upper trapezoidal +* factor R, AF(K+1:M,K+1:N) contains the partially updated residual +* matrix of R. +* + DO J = K + 1, N CALL DCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 ) - 40 CONTINUE + END DO * CALL DORMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK, $ M, WORK( M*N+1 ), LWORK-M*N, INFO ) * - DO 50 J = 1, N + DO J = 1, N * -* Compare i-th column of QR and jpvt(i)-th column of A +* Compare J-th column of QR and JPVT(J)-th column of A. * CALL DAXPY( M, -ONE, A( 1, JPVT( J ) ), 1, WORK( ( J-1 )*M+1 ), $ 1 ) - 50 CONTINUE + END DO * DQPT01 = DLANGE( 'One-norm', M, N, WORK, M, RWORK ) / $ ( DBLE( MAX( M, N ) )*DLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/dqrt11.f b/lapack-netlib/TESTING/LIN/dqrt11.f index 33c7fab37..38bbeb822 100644 --- a/lapack-netlib/TESTING/LIN/dqrt11.f +++ b/lapack-netlib/TESTING/LIN/dqrt11.f @@ -157,9 +157,9 @@ CALL DORM2R( 'Left', 'Transpose', M, M, K, A, LDA, TAU, WORK, M, $ WORK( M*M+1 ), INFO ) * - DO 10 J = 1, M + DO J = 1, M WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE - 10 CONTINUE + END DO * DQRT11 = DLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) / $ ( DBLE( M )*DLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/dqrt12.f b/lapack-netlib/TESTING/LIN/dqrt12.f index 278e01bf0..b8a124c59 100644 --- a/lapack-netlib/TESTING/LIN/dqrt12.f +++ b/lapack-netlib/TESTING/LIN/dqrt12.f @@ -26,7 +26,7 @@ *> DQRT12 computes the singular values `svlues' of the upper trapezoid *> of A(1:M,1:N) and returns the ratio *> -*> || s - svlues||/(||svlues||*eps*max(M,N)) +*> || svlues - s ||/(||s||*eps*max(M,N)) *> \endverbatim * * Arguments: @@ -113,8 +113,7 @@ EXTERNAL DASUM, DLAMCH, DLANGE, DNRM2 * .. * .. External Subroutines .. 
-      EXTERNAL           DAXPY, DBDSQR, DGEBD2, DLABAD, DLASCL, DLASET,
-     $                   XERBLA
+      EXTERNAL           DAXPY, DBDSQR, DGEBD2, DLASCL, DLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          DBLE, MAX, MIN
@@ -145,17 +144,16 @@
 *     Copy upper triangle of A into work
 *
       CALL DLASET( 'Full', M, N, ZERO, ZERO, WORK, M )
-      DO 20 J = 1, N
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, N
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = A( I, J )
-   10    CONTINUE
-   20 CONTINUE
+         END DO
+      END DO
 *
 *     Get machine parameters
 *
       SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
 *
 *     Scale work if max entry outside range [SMLNUM,BIGNUM]
 *
@@ -199,16 +197,18 @@
 *
       ELSE
 *
-         DO 30 I = 1, MN
+         DO I = 1, MN
             WORK( M*N+I ) = ZERO
-   30    CONTINUE
+         END DO
       END IF
 *
 *     Compare s and singular values of work
 *
       CALL DAXPY( MN, -ONE, S, 1, WORK( M*N+1 ), 1 )
+*
       DQRT12 = DASUM( MN, WORK( M*N+1 ), 1 ) /
-     $         ( DLAMCH( 'Epsilon' )*DBLE( MAX( M, N ) ) )
+     $         ( DLAMCH('Epsilon') * DBLE( MAX( M, N ) ) )
+*
       IF( NRMSVL.NE.ZERO )
      $   DQRT12 = DQRT12 / NRMSVL
 *
diff --git a/lapack-netlib/TESTING/LIN/schkaa.F b/lapack-netlib/TESTING/LIN/schkaa.F
index a5b826d06..036b13924 100644
--- a/lapack-netlib/TESTING/LIN/schkaa.F
+++ b/lapack-netlib/TESTING/LIN/schkaa.F
@@ -63,6 +63,7 @@
 *> SLQ    8    List types on next line if 0 < NTYPES <  8
 *> SQL    8    List types on next line if 0 < NTYPES <  8
 *> SQP    6    List types on next line if 0 < NTYPES <  6
+*> SQK   19    List types on next line if 0 < NTYPES < 19
 *> STZ    3    List types on next line if 0 < NTYPES <  3
 *> SLS    6    List types on next line if 0 < NTYPES <  6
 *> SEQ
@@ -147,11 +148,11 @@
      $                   NBVAL( MAXIN ), NBVAL2( MAXIN ),
      $                   NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ),
      $                   RANKVAL( MAXIN ), PIV( NMAX )
-      REAL               E( NMAX ), S( 2*NMAX )
 *     ..
 *     .. Allocatable Arrays ..
       INTEGER            AllocateStatus
-      REAL, DIMENSION(:), ALLOCATABLE :: RWORK
+      REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S
+      REAL, DIMENSION(:), ALLOCATABLE :: E
       REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK
 *     ..
 *     .. External Functions ..
@@ -162,13 +163,13 @@
 *     .. External Subroutines ..
       EXTERNAL           ALAREQ, SCHKEQ, SCHKGB, SCHKGE, SCHKGT, SCHKLQ,
      $                   SCHKORHR_COL, SCHKPB, SCHKPO, SCHKPS, SCHKPP,
-     $                   SCHKPT, SCHKQ3, SCHKQL, SCHKQR, SCHKRQ, SCHKSP,
-     $                   SCHKSY, SCHKSY_ROOK, SCHKSY_RK, SCHKSY_AA,
-     $                   SCHKTB, SCHKTP, SCHKTR, SCHKTZ, SDRVGB, SDRVGE,
-     $                   SDRVGT, SDRVLS, SDRVPB, SDRVPO, SDRVPP, SDRVPT,
-     $                   SDRVSP, SDRVSY, SDRVSY_ROOK, SDRVSY_RK,
-     $                   SDRVSY_AA, ILAVER, SCHKLQTP, SCHKQRT, SCHKQRTP,
-     $                   SCHKLQT, SCHKTSQR
+     $                   SCHKPT, SCHKQ3, SCHKQP3RK, SCHKQL, SCHKQR,
+     $                   SCHKRQ, SCHKSP, SCHKSY, SCHKSY_ROOK, SCHKSY_RK,
+     $                   SCHKSY_AA, SCHKTB, SCHKTP, SCHKTR, SCHKTZ,
+     $                   SDRVGB, SDRVGE, SDRVGT, SDRVLS, SDRVPB, SDRVPO,
+     $                   SDRVPP, SDRVPT, SDRVSP, SDRVSY, SDRVSY_ROOK,
+     $                   SDRVSY_RK, SDRVSY_AA, ILAVER, SCHKLQTP, SCHKQRT,
+     $                   SCHKQRTP, SCHKLQT, SCHKTSQR
 *     ..
 *     .. Scalars in Common ..
       LOGICAL            LERR, OK
@@ -188,13 +189,17 @@
 *     ..
 *     .. Allocate memory dynamically ..
* - ALLOCATE (A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) + ALLOCATE ( A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) + ALLOCATE ( B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (WORK( NMAX, NMAX+MAXRHS+30 ) , STAT = AllocateStatus ) + ALLOCATE ( WORK( NMAX, 3*NMAX+MAXRHS+30 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) + ALLOCATE ( E( NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. * .. Executable Statements .. @@ -920,6 +925,23 @@ ELSE WRITE( NOUT, FMT = 9989 )PATH END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* QK: truncated QR factorization with pivoting +* + NTYPES = 19 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ), + $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), B( 1, 4 ), + $ WORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF * ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN * @@ -1048,6 +1070,8 @@ * DEALLOCATE (A, STAT = AllocateStatus) DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (E, STAT = AllocateStatus) + DEALLOCATE (S, STAT = AllocateStatus) DEALLOCATE (WORK, STAT = AllocateStatus) DEALLOCATE (RWORK, STAT = AllocateStatus) * diff --git a/lapack-netlib/TESTING/LIN/schkqp3rk.f b/lapack-netlib/TESTING/LIN/schkqp3rk.f new file mode 100644 index 000000000..36cf9370e --- /dev/null +++ b/lapack-netlib/TESTING/LIN/schkqp3rk.f @@ -0,0 +1,831 @@ +*> \brief \b SCHKQP3RK +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, +* $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, +* $ B, COPYB, S, TAU, +* $ WORK, IWORK, NOUT ) +* IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. +* INTEGER NM, NN, NNS, NNB, NOUT +* REAL THRESH +* .. +* .. Array Arguments .. +* LOGICAL DOTYPE( * ) +* INTEGER IWORK( * ), MVAL( * ), NBVAL( * ), NSVAL( * ), +* $ NVAL( * ), NXVAL( * ) +* REAL A( * ), COPYA( * ), B( * ), COPYB( * ), +* $ S( * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SCHKQP3RK tests SGEQP3RK. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] DOTYPE +*> \verbatim +*> DOTYPE is LOGICAL array, dimension (NTYPES) +*> The matrix types to be used for testing. Matrices of type j +*> (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) = +*> .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used. +*> \endverbatim +*> +*> \param[in] NM +*> \verbatim +*> NM is INTEGER +*> The number of values of M contained in the vector MVAL. +*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. 
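+*>          (Each M is paired with every N, NRHS, and (NB,NX) value
+*>          in the test loops.)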
+*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNS +*> \verbatim +*> NNS is INTEGER +*> The number of values of NRHS contained in the vector NSVAL. +*> \endverbatim +*> +*> \param[in] NSVAL +*> \verbatim +*> NSVAL is INTEGER array, dimension (NNS) +*> The values of the number of right hand sides NRHS. +*> \endverbatim +*> +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB and NX contained in the +*> vectors NBVAL and NXVAL. The blocking parameters are used +*> in pairs (NB,NX). +*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NNB) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NXVAL +*> \verbatim +*> NXVAL is INTEGER array, dimension (NNB) +*> The values of the crossover point NX. +*> \endverbatim +*> +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. +*> \endverbatim +*> +*> \param[out] A +*> \verbatim +*> A is REAL array, dimension (MMAX*NMAX) +*> where MMAX is the maximum value of M in MVAL and NMAX is the +*> maximum value of N in NVAL. +*> \endverbatim +*> +*> \param[out] COPYA +*> \verbatim +*> COPYA is REAL array, dimension (MMAX*NMAX) +*> \endverbatim +*> +*> \param[out] B +*> \verbatim +*> B is REAL array, dimension (MMAX*NSMAX) +*> where MMAX is the maximum value of M in MVAL and NSMAX is the +*> maximum value of NRHS in NSVAL. +*> \endverbatim +*> +*> \param[out] COPYB +*> \verbatim +*> COPYB is REAL array, dimension (MMAX*NSMAX) +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is REAL array, dimension +*> (min(MMAX,NMAX)) +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is REAL array, dimension (MMAX) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension +*> (MMAX*NMAX + 4*NMAX + MMAX) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (2*NMAX) +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup single_lin +* +* ===================================================================== + SUBROUTINE SCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, + $ B, COPYB, S, TAU, + $ WORK, IWORK, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER NM, NN, NNB, NNS, NOUT + REAL THRESH +* .. +* .. Array Arguments .. + LOGICAL DOTYPE( * ) + INTEGER IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ), + $ NSVAL( * ), NXVAL( * ) + REAL A( * ), COPYA( * ), B( * ), COPYB( * ), + $ S( * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. 
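+*     NTYPES is the number of matrix types, NTESTS the number of test
+*     ratios per run; BIGNUM is the failure value assigned by the
+*     monotonicity check in test 4.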
+ INTEGER NTYPES + PARAMETER ( NTYPES = 19 ) + INTEGER NTESTS + PARAMETER ( NTESTS = 5 ) + REAL ONE, ZERO, BIGNUM + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0, + $ BIGNUM = 1.0E+38 ) +* .. +* .. Local Scalars .. + CHARACTER DIST, TYPE + CHARACTER*3 PATH + INTEGER I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO, + $ INB, IND_OFFSET_GEN, + $ IND_IN, IND_OUT, INS, INFO, + $ ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO, + $ KFACT, KL, KMAX, KU, LDA, LW, LWORK, + $ LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N, + $ NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS, + $ NRUN, NX, T + REAL ANORM, CNDNUM, EPS, ABSTOL, RELTOL, + $ DTEMP, MAXC2NRMK, RELMAXC2NRMK +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ), ISEEDY( 4 ) + REAL RESULT( NTESTS ), RDUMMY( 1 ) +* .. +* .. External Functions .. + REAL SLAMCH, SQPT01, SQRT11, SQRT12, SLANGE + EXTERNAL SLAMCH, SQPT01, SQRT11, SQRT12, SLANGE +* .. +* .. External Subroutines .. + EXTERNAL ALAERH, ALAHD, ALASUM, SAXPY, SGEQP3RK, + $ SLACPY, SLAORD, SLASET, SLATB4, SLATMS, + $ SORMQR, SSWAP, ICOPY, XLAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, MOD, REAL +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, IOUNIT +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, IOUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEEDY / 1988, 1989, 1990, 1991 / +* .. +* .. Executable Statements .. +* +* Initialize constants and the random number seed. +* + PATH( 1: 1 ) = 'Single precision' + PATH( 2: 3 ) = 'QK' + NRUN = 0 + NFAIL = 0 + NERRS = 0 + DO I = 1, 4 + ISEED( I ) = ISEEDY( I ) + END DO + EPS = SLAMCH( 'Epsilon' ) + INFOT = 0 +* + DO IM = 1, NM +* +* Do for each value of M in MVAL. +* + M = MVAL( IM ) + LDA = MAX( 1, M ) +* + DO IN = 1, NN +* +* Do for each value of N in NVAL. +* + N = NVAL( IN ) + MINMN = MIN( M, N ) + LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ), + $ M*N + 2*MINMN + 4*N ) +* + DO INS = 1, NNS + NRHS = NSVAL( INS ) +* +* Set up parameters with SLATB4 and generate +* M-by-NRHS B matrix with SLATMS. +* IMAT = 14: +* Random matrix, CNDNUM = 2, NORM = ONE, +* MODE = 3 (geometric distribution of singular values). +* + CALL SLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'SLATMS' + CALL SLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYB, LDA, WORK, INFO ) + + +* +* Check error code from SLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'SLATMS', INFO, 0, ' ', M, + $ NRHS, -1, -1, -1, 6, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + DO IMAT = 1, NTYPES +* +* Do the tests only if DOTYPE( IMAT ) is true. +* + IF( .NOT.DOTYPE( IMAT ) ) + $ CYCLE +* +* The type of distribution used to generate the random +* eigen-/singular values: +* ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 ) +* +* Do for each type of NON-SYMMETRIC matrix: CNDNUM NORM MODE +* 1. Zero matrix +* 2. Random, Diagonal, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 3. Random, Upper triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 4. Random, Lower triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 5. Random, First column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 6. Random, Last MINMN column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 7. 
Random, Last N column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+* 8. Random, Middle column in MINMN is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+* 9. Random, First half of MINMN columns are zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+* 10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+* 11. Random, Half MINMN columns in the middle are zero starting
+*     from MINMN/2-(MINMN/2)/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+* 12. Random, Odd columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+* 13. Random, Even columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+* 14. Random, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+* 15. Random, CNDNUM = sqrt(0.1/EPS) CNDNUM = BADC1 = sqrt(0.1/EPS) ONE 3 ( geometric distribution of singular values )
+* 16. Random, CNDNUM = 0.1/EPS CNDNUM = BADC2 = 0.1/EPS ONE 3 ( geometric distribution of singular values )
+* 17. Random, CNDNUM = 0.1/EPS, CNDNUM = BADC2 = 0.1/EPS ONE 2 ( one small singular value, S(N)=1/CNDNUM )
+*     one small singular value S(N)=1/CNDNUM
+* 18. Random, CNDNUM = 2, scaled near underflow CNDNUM = 2 SMALL = SAFMIN
+* 19. Random, CNDNUM = 2, scaled near overflow CNDNUM = 2 LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) ) 3 ( geometric distribution of singular values )
+*
+               IF( IMAT.EQ.1 ) THEN
+*
+*                 Matrix 1: Zero matrix
+*
+                  CALL SLASET( 'Full', M, N, ZERO, ZERO, COPYA, LDA )
+                  DO I = 1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 )
+     $                 .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN
+*
+*                 Matrices 2-4 and 14-19.
+*
+*                 Set up parameters with SLATB4 and generate a test
+*                 matrix with SLATMS.
+*
+                  CALL SLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'SLATMS'
+                  CALL SLATMS( M, N, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA, LDA, WORK, INFO )
+*
+*                 Check error code from SLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'SLATMS', INFO, 0, ' ', M, N,
+     $                            -1, -1, -1, IMAT, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+                  CALL SLAORD( 'Decreasing', MINMN, S, 1 )
+*
+               ELSE IF( MINMN.GE.2
+     $                 .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN
+*
+*                 Rectangular matrices 5-13 that contain zero columns,
+*                 only for matrices with MINMN >= 2.
+*
+*                 JB_ZERO is the column index of the ZERO block.
+*                 NB_ZERO is the column block size of the ZERO block.
+*                 NB_GEN is the column block size of the
+*                 generated block.
+*                 J_INC is the non-zero column index increment
+*                 for matrices 12 and 13.
+*                 J_FIRST_NZ is the index of the first non-zero
+*                 column.
+*
+                  IF( IMAT.EQ.5 ) THEN
+*
+*                    First column is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.6 ) THEN
+*
+*                    Last column MINMN is zero.
+*
+                     JB_ZERO = MINMN
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.7 ) THEN
+*
+*                    Last column N is zero.
+*
+                     JB_ZERO = N
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.8 ) THEN
+*
+*                    Middle column in MINMN is zero.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.9 ) THEN
+*
+*                    First half of MINMN columns are zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.10 ) THEN
+*
+*                    Last columns are zero columns,
+*                    starting from (MINMN / 2 + 1) column.
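+*                    (For instance, M = 10, N = 8 gives MINMN = 8, so
+*                    columns 5:8 are zeroed and NB_GEN = 4 columns are
+*                    generated.)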
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = N - JB_ZERO + 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.11 ) THEN
+*
+*                    Half of the columns in the middle of MINMN
+*                    columns are zero, starting from
+*                    MINMN/2 - (MINMN/2)/2 + 1 column.
+*
+                     JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.12 ) THEN
+*
+*                    Odd-numbered columns are zero.
+*
+                     NB_GEN = N / 2
+                     NB_ZERO = N - NB_GEN
+                     J_INC = 2
+                     J_FIRST_NZ = 2
+*
+                  ELSE IF( IMAT.EQ.13 ) THEN
+*
+*                    Even-numbered columns are zero.
+*
+                     NB_ZERO = N / 2
+                     NB_GEN = N - NB_ZERO
+                     J_INC = 2
+                     J_FIRST_NZ = 1
+*
+                  END IF
+*
+*
+*                 1) Set the first NB_ZERO columns in COPYA(1:M,1:N)
+*                    to zero.
+*
+                  CALL SLASET( 'Full', M, NB_ZERO, ZERO, ZERO,
+     $                         COPYA, LDA )
+*
+*                 2) Generate an M-by-(N-NB_ZERO) matrix with the
+*                    chosen singular value distribution
+*                    in COPYA(1:M,NB_ZERO+1:N).
+*
+                  CALL SLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU,
+     $                         ANORM, MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'SLATMS'
+*
+                  IND_OFFSET_GEN = NB_ZERO * LDA
+*
+                  CALL SLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA( IND_OFFSET_GEN + 1 ), LDA,
+     $                         WORK, INFO )
+*
+*                 Check error code from SLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'SLATMS', INFO, 0, ' ', M,
+     $                            NB_GEN, -1, -1, -1, IMAT, NFAIL,
+     $                            NERRS, NOUT )
+                     CYCLE
+                  END IF
+*
+*                 3) Swap the generated columns from the right-side
+*                    NB_GEN-size block in COPYA into the correct
+*                    column positions.
+*
+                  IF( IMAT.EQ.6
+     $                .OR. IMAT.EQ.7
+     $                .OR. IMAT.EQ.8
+     $                .OR. IMAT.EQ.10
+     $                .OR. IMAT.EQ.11 ) THEN
+*
+*                    Move by swapping the generated columns
+*                    (NB_ZERO+1:NB_ZERO+JB_ZERO-1) of the right
+*                    NB_GEN-size block into columns (1:JB_ZERO-1).
+*
+                     DO J = 1, JB_ZERO-1, 1
+                        CALL SSWAP( M,
+     $                       COPYA( ( NB_ZERO+J-1)*LDA+1), 1,
+     $                       COPYA( (J-1)*LDA + 1 ), 1 )
+                     END DO
+*
+                  ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN
+*
+*                    ( IMAT = 12, Odd-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the even zero columns in
+*                    the left NB_ZERO-size block.
+*
+*                    ( IMAT = 13, Even-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the odd zero columns in
+*                    the left NB_ZERO-size block.
+*
+                     DO J = 1, NB_GEN, 1
+                        IND_OUT = ( NB_ZERO+J-1 )*LDA + 1
+                        IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA
+     $                           + 1
+                        CALL SSWAP( M,
+     $                       COPYA( IND_OUT ), 1,
+     $                       COPYA( IND_IN), 1 )
+                     END DO
+*
+                  END IF
+*
+*                 4) Order the singular values generated by
+*                    SLATMS in decreasing order and add trailing zeros
+*                    that correspond to zero columns.
+*                    The total number of singular values is MINMN.
+*
+                  MINMNB_GEN = MIN( M, NB_GEN )
+*
+                  DO I = MINMNB_GEN+1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE
+*
+*                 IF(MINMN.LT.2) skip this size for this matrix type.
+*
+                  CYCLE
+               END IF
+*
+*              Initialize a copy array for a pivot array for SGEQP3RK.
+*
+               DO I = 1, N
+                  IWORK( I ) = 0
+               END DO
+*
+               DO INB = 1, NNB
+*
+*                 Do for each pair of values (NB,NX) in NBVAL and NXVAL.
+*
+                  NB = NBVAL( INB )
+                  CALL XLAENV( 1, NB )
+                  NX = NXVAL( INB )
+                  CALL XLAENV( 3, NX )
+*
+*                 We use MIN(M,N)+1 as the loop bound because we need
+*                 a test with KMAX larger than MIN(M,N); in that case
+*                 the factorization should proceed as if
+*                 KMAX = MIN(M,N).
+*
+                  DO KMAX = 0, MIN(M,N)+1
+*
+*                    Get a working copy of COPYA into A( 1:M,1:N ).
+*                    Get a working copy of COPYB into
+*                    A( 1:M, (N+1):(N+NRHS) ).
+*                    Get a working copy of COPYB into B( 1:M, 1:NRHS ).
+*                    Get a working copy of the zeroed IWORK(1:N) into
+*                    IWORK( N+1:2N ), which is going to be used as
+*                    the pivot array.
+*                    NOTE: IWORK(2N+1:3N) is going to be used as a WORK
+*                    array for the routine.
+*
+                     CALL SLACPY( 'All', M, N, COPYA, LDA, A, LDA )
+                     CALL SLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                            A( LDA*N + 1 ), LDA )
+                     CALL SLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                            B, LDA )
+                     CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 )
+*
+                     ABSTOL = -1.0
+                     RELTOL = -1.0
+*
+*                    Compute the QR factorization with pivoting of A
+*
+                     LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ),
+     $                                 3*N + NRHS - 1 ) )
+*
+*                    Compute SGEQP3RK factorization of A.
+*
+                     SRNAMT = 'SGEQP3RK'
+                     CALL SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                              A, LDA, KFACT, MAXC2NRMK,
+     $                              RELMAXC2NRMK, IWORK( N+1 ), TAU,
+     $                              WORK, LW, IWORK( 2*N+1 ), INFO )
+*
+*                    Check error code from SGEQP3RK.
+*
+                     IF( INFO.LT.0 )
+     $                  CALL ALAERH( PATH, 'SGEQP3RK', INFO, 0, ' ',
+     $                               M, N, NX, -1, NB, IMAT,
+     $                               NFAIL, NERRS, NOUT )
+*
+*                    Compute test 1:
+*
+*                    This test is only for the full-rank factorization
+*                    of the matrix A.
+*
+*                    Array S(1:min(M,N)) contains svd(A), the singular
+*                    values of the original matrix A in decreasing
+*                    absolute value order. The test computes svd(R),
+*                    the vector of singular values of the upper
+*                    trapezoid of A(1:M,1:N) that contains the factor
+*                    R, in decreasing order. The test returns
+*                    the ratio:
+*
+*                    2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS )
+*
+                     IF( KFACT.EQ.MINMN ) THEN
+*
+                        RESULT( 1 ) = SQRT12( M, N, A, LDA, S, WORK,
+     $                                LWORK )
+*
+                        DO T = 1, 1
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK',
+     $                            M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T, RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                    End test 1
+*
+                     END IF
+*
+*                    Compute test 2:
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS )
+*
+                     RESULT( 2 ) = SQPT01( M, N, KFACT, COPYA, A, LDA,
+     $                             TAU, IWORK( N+1 ), WORK, LWORK )
+*
+*                    Compute test 3:
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( Q**T * Q - I ) / ( M * EPS )
+*
+                     RESULT( 3 ) = SQRT11( M, KFACT, A, LDA, TAU, WORK,
+     $                             LWORK )
+*
+*                    Print information about the tests that did not
+*                    pass the threshold.
+*
+                     DO T = 2, 3
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK', M, N,
+     $                         NRHS, KMAX, ABSTOL, RELTOL,
+     $                         NB, NX, IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 2
+*
+*                    Compute test 4:
+*
+*                    This test is only for factorizations with rank
+*                    at least 2.
+*                    The elements on the diagonal of R should be
+*                    non-increasing.
+*
+*                    The test returns the ratio:
+*
+*                    Returns BIGNUM if abs(R(K+1,K+1)) > abs(R(K,K)),
+*                    K=1:KFACT-1
+*
+                     IF( MIN(KFACT, MINMN).GE.2 ) THEN
+*
+*                       Initialize the ratio before scanning
+*                       the diagonal.
+*
+                        RESULT( 4 ) = ZERO
+*
+                        DO J = 1, KFACT-1, 1
+
+                           DTEMP = (( ABS( A( (J-1)*M+J ) ) -
+     $                        ABS( A( (J)*M+J+1 ) ) ) /
+     $                        ABS( A(1) ) )
+*
+                           IF( DTEMP.LT.ZERO ) THEN
+                              RESULT( 4 ) = BIGNUM
+                           END IF
+*
+                        END DO
+*
+*                       Print information about the tests that did not
+*                       pass the threshold.
+*
+                        DO T = 4, 4
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK',
+     $                            M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T,
+     $                            RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                    End test 4.
+*
+                     END IF
+*
+*                    Compute test 5:
+*
+*                    This test is only for matrix A with min(M,N) > 0.
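+*                    It checks that the Q**T * B block returned by
+*                    SGEQP3RK in A( 1:M, N+1:N+NRHS ) agrees with
+*                    Q**T * B computed independently by SORMQR from
+*                    the stored reflectors.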
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( (Q**T * B returned by SGEQP3RK) -
+*                            (Q**T * B computed by SORMQR) ) /
+*                    ( M * EPS )
+*
+*                    (1) Compute B:=Q**T * B in the matrix B.
+*
+                     IF( MINMN.GT.0 ) THEN
+*
+                        LWORK_MQR = MAX(1, NRHS)
+                        CALL SORMQR( 'Left', 'Transpose',
+     $                       M, NRHS, KFACT, A, LDA, TAU, B, LDA,
+     $                       WORK, LWORK_MQR, INFO )
+*
+                        DO I = 1, NRHS
+*
+*                          Subtract the (N+I)-th column of A from the
+*                          I-th column of B.
+*
+                           CALL SAXPY( M, -ONE, A( ( N+I-1 )*LDA+1 ),
+     $                          1, B( ( I-1 )*LDA+1 ), 1 )
+                        END DO
+*
+                        RESULT( 5 ) =
+     $                    ABS(
+     $                    SLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY )
+     $                    / ( REAL( M )*SLAMCH( 'Epsilon' ) )
+     $                    )
+*
+*                       Print information about the tests that did not
+*                       pass the threshold.
+*
+                        DO T = 5, 5
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK',
+     $                            M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T, RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                    End compute test 5.
+*
+                     END IF
+*
+*                 END DO KMAX = 0, MIN(M,N)+1
+*
+                  END DO
+*
+*              END DO for INB = 1, NNB
+*
+               END DO
+*
+*           END DO for IMAT = 1, NTYPES
+*
+            END DO
+*
+*        END DO for INS = 1, NNS
+*
+         END DO
+*
+*     END DO for IN = 1, NN
+*
+      END DO
+*
+*     END DO for IM = 1, NM
+*
+      END DO
+*
+*     Print a summary of the results.
+*
+      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
+*
+ 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5,
+     $      ', KMAX =', I5, ', ABSTOL =', G12.5,
+     $      ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4,
+     $      ', type ', I2, ', test ', I2, ', ratio =', G12.5 )
+*
+*     End of SCHKQP3RK
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f
index d3c27ae56..6490cd7c3 100644
--- a/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f
@@ -423,9 +423,9 @@
 *              block factorization, LWORK is the length of AINV.
 *
                SRNAMT = 'SSYTRF_AA_2STAGE'
-               LWORK = MIN(N*NB, 3*NMAX*NMAX)
+               LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
               CALL SSYTRF_AA_2STAGE( UPLO, N, AFAC, LDA,
-     $                                AINV, (3*NB+1)*N,
+     $                                AINV, MAX( 1, (3*NB+1)*N ),
      $                                IWORK, IWORK( 1+N ),
      $                                WORK, LWORK,
      $                                INFO )
@@ -505,7 +505,6 @@
                CALL SLACPY( 'Full', N, NRHS, B, LDA, X, LDA )
 *
                SRNAMT = 'SSYTRS_AA_2STAGE'
-               LWORK = MAX( 1, 3*N-2 )
                CALL SSYTRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
      $              AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ),
      $              X, LDA, INFO )
diff --git a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f
index aff32bce9..319b90805 100644
--- a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f
@@ -400,9 +400,9 @@
 *           Factor the matrix and solve the system using SSYSV_AA.
 *
             SRNAMT = 'SSYSV_AA_2STAGE '
-            LWORK = MIN(N*NB, 3*NMAX*NMAX)
+            LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
             CALL SSYSV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
-     $                            AINV, (3*NB+1)*N,
+     $                            AINV, MAX( 1, (3*NB+1)*N ),
      $                            IWORK, IWORK( 1+N ),
      $                            X, LDA, WORK, LWORK, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/slatb4.f b/lapack-netlib/TESTING/LIN/slatb4.f
index 94d29db40..72a310727 100644
--- a/lapack-netlib/TESTING/LIN/slatb4.f
+++ b/lapack-netlib/TESTING/LIN/slatb4.f
@@ -153,9 +153,6 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           SLABAD
-*     ..
 *     .. Save statement ..
       SAVE               EPS, SMALL, LARGE, BADC1, BADC2, FIRST
 *     ..
@@ -173,11 +170,6 @@
          BADC1 = SQRT( BADC2 )
          SMALL = SLAMCH( 'Safe minimum' )
          LARGE = ONE / SMALL
-*
-*        If it looks like we're on a Cray, take the square root of
-*        SMALL and LARGE to avoid overflow and underflow problems.
-*
-         CALL SLABAD( SMALL, LARGE )
          SMALL = SHRINK*( SMALL / EPS )
          LARGE = ONE / SMALL
       END IF
@@ -232,6 +224,110 @@
       ELSE
          ANORM = ONE
       END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        xQK: truncated QR with pivoting.
+*             Set parameters to generate a general
+*             M x N matrix.
+*
+*        Set TYPE, the type of matrix to be generated. 'N' is nonsymmetric.
+*
+         TYPE = 'N'
+*
+*        Set DIST, the type of distribution for the random
+*        number generator. 'S' is symmetric (uniform on (-1,1)).
+*
+         DIST = 'S'
+*
+*        Set the lower and upper bandwidths.
+*
+         IF( IMAT.EQ.2 ) THEN
+*
+*           2. Random, Diagonal, CNDNUM = 2
+*
+            KL = 0
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.3 ) THEN
+*
+*           3. Random, Upper triangular, CNDNUM = 2
+*
+            KL = 0
+            KU = MAX( N-1, 0 )
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.4 ) THEN
+*
+*           4. Random, Lower triangular, CNDNUM = 2
+*
+            KL = MAX( M-1, 0 )
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE
+*
+*           5.-19. Rectangular matrix
+*
+            KL = MAX( M-1, 0 )
+            KU = MAX( N-1, 0 )
+*
+            IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN
+*
+*              5.-14. Random, CNDNUM = 2.
+*
+               CNDNUM = TWO
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.15 ) THEN
+*
+*              15. Random, CNDNUM = sqrt(0.1/EPS)
+*
+               CNDNUM = BADC1
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.16 ) THEN
+*
+*              16. Random, CNDNUM = 0.1/EPS
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.17 ) THEN
+*
+*              17. Random, CNDNUM = 0.1/EPS,
+*                  one small singular value S(N)=1/CNDNUM
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 2
+*
+            ELSE IF( IMAT.EQ.18 ) THEN
+*
+*              18. Random, scaled near underflow
+*
+               CNDNUM = TWO
+               ANORM = SMALL
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.19 ) THEN
+*
+*              19. Random, scaled near overflow
+*
+               CNDNUM = TWO
+               ANORM = LARGE
+               MODE = 3
+*
+            END IF
+*
+         END IF
 *
@@ -518,17 +614,18 @@
 *
 *        Set the norm and condition number.
 *
-         IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN
+         MAT = ABS( IMAT )
+         IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN
             CNDNUM = BADC1
-         ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN
+         ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN
             CNDNUM = BADC2
          ELSE
            CNDNUM = TWO
         END IF
 *
-         IF( IMAT.EQ.4 ) THEN
+         IF( MAT.EQ.4 ) THEN
             ANORM = SMALL
-         ELSE IF( IMAT.EQ.5 ) THEN
+         ELSE IF( MAT.EQ.5 ) THEN
             ANORM = LARGE
          ELSE
             ANORM = ONE
diff --git a/lapack-netlib/TESTING/LIN/sqpt01.f b/lapack-netlib/TESTING/LIN/sqpt01.f
index de0c80e53..f53686a65 100644
--- a/lapack-netlib/TESTING/LIN/sqpt01.f
+++ b/lapack-netlib/TESTING/LIN/sqpt01.f
@@ -33,7 +33,8 @@
 *> Householder vectors, and the rest of AF contains a partially updated
 *> matrix.
 *>
-*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M)
+*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) )
+*> where || . || is matrix one norm.
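+*> Q is applied implicitly through the K Householder reflectors
+*> stored below the diagonal of AF(1:M,1:K), so the residual is
+*> formed without building Q explicitly.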
 *> \endverbatim
 *
*  Arguments:
@@ -172,28 +173,28 @@
 *
       NORMA = SLANGE( 'One-norm', M, N, A, LDA, RWORK )
 *
-      DO 30 J = 1, K
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, K
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = AF( I, J )
-   10    CONTINUE
-         DO 20 I = J + 1, M
+         END DO
+         DO I = J + 1, M
             WORK( ( J-1 )*M+I ) = ZERO
-   20    CONTINUE
-   30 CONTINUE
-      DO 40 J = K + 1, N
+         END DO
+      END DO
+      DO J = K + 1, N
          CALL SCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 )
-   40 CONTINUE
+      END DO
 *
       CALL SORMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK,
      $             M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
-      DO 50 J = 1, N
+      DO J = 1, N
 *
-*        Compare i-th column of QR and jpvt(i)-th column of A
+*        Compare J-th column of QR and JPVT(J)-th column of A.
 *
          CALL SAXPY( M, -ONE, A( 1, JPVT( J ) ), 1, WORK( ( J-1 )*M+1 ),
      $               1 )
-   50 CONTINUE
+      END DO
 *
       SQPT01 = SLANGE( 'One-norm', M, N, WORK, M, RWORK ) /
      $         ( REAL( MAX( M, N ) )*SLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/sqrt11.f b/lapack-netlib/TESTING/LIN/sqrt11.f
index d4422dacb..a3753adcf 100644
--- a/lapack-netlib/TESTING/LIN/sqrt11.f
+++ b/lapack-netlib/TESTING/LIN/sqrt11.f
@@ -157,9 +157,9 @@
       CALL SORM2R( 'Left', 'Transpose', M, M, K, A, LDA, TAU, WORK, M,
      $             WORK( M*M+1 ), INFO )
 *
-      DO 10 J = 1, M
+      DO J = 1, M
          WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE
-   10 CONTINUE
+      END DO
 *
       SQRT11 = SLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) /
      $         ( REAL( M )*SLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/sqrt12.f b/lapack-netlib/TESTING/LIN/sqrt12.f
index 2eab0ee0d..46b359e07 100644
--- a/lapack-netlib/TESTING/LIN/sqrt12.f
+++ b/lapack-netlib/TESTING/LIN/sqrt12.f
@@ -26,7 +26,7 @@
 *> SQRT12 computes the singular values `svlues' of the upper trapezoid
 *> of A(1:M,1:N) and returns the ratio
 *>
-*>      || s - svlues||/(||svlues||*eps*max(M,N))
+*>      || svlues - s ||/(||s||*eps*max(M,N))
 *> \endverbatim
 *
*  Arguments:
@@ -113,8 +113,7 @@
       EXTERNAL           SASUM, SLAMCH, SLANGE, SNRM2
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SAXPY, SBDSQR, SGEBD2, SLABAD, SLASCL, SLASET,
-     $                   XERBLA
+      EXTERNAL           SAXPY, SBDSQR, SGEBD2, SLASCL, SLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN, REAL
@@ -145,17 +144,16 @@
 *     Copy upper triangle of A into work
 *
       CALL SLASET( 'Full', M, N, ZERO, ZERO, WORK, M )
-      DO 20 J = 1, N
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, N
+         DO I = 1, MIN( J, M )
            WORK( ( J-1 )*M+I ) = A( I, J )
-   10    CONTINUE
-   20 CONTINUE
+         END DO
+      END DO
 *
 *     Get machine parameters
 *
       SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale work if max entry outside range [SMLNUM,BIGNUM]
 *
@@ -199,9 +197,9 @@
 *
       ELSE
 *
-         DO 30 I = 1, MN
+         DO I = 1, MN
             WORK( M*N+I ) = ZERO
-   30    CONTINUE
+         END DO
       END IF
 *
 *     Compare s and singular values of work
diff --git a/lapack-netlib/TESTING/LIN/zchkaa.F b/lapack-netlib/TESTING/LIN/zchkaa.F
index a118515a5..f1020f2d8 100644
--- a/lapack-netlib/TESTING/LIN/zchkaa.F
+++ b/lapack-netlib/TESTING/LIN/zchkaa.F
@@ -69,6 +69,7 @@
 *> ZLQ    8    List types on next line if 0 < NTYPES <  8
 *> ZQL    8    List types on next line if 0 < NTYPES <  8
 *> ZQP    6    List types on next line if 0 < NTYPES <  6
+*> ZQK   19    List types on next line if 0 < NTYPES < 19
 *> ZTZ    3    List types on next line if 0 < NTYPES <  3
 *> ZLS    6    List types on next line if 0 < NTYPES <  6
 *> ZEQ
@@ -153,12 +154,11 @@
      $                   NBVAL( MAXIN ), NBVAL2( MAXIN ),
      $                   NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ),
      $                   RANKVAL( MAXIN ), PIV( NMAX )
-      DOUBLE PRECISION   S( 2*NMAX )
-      COMPLEX*16         E( NMAX )
-*
-*     .. Allocatable Arrays ..
+*     ..
+*     .. Allocatable Arrays ..
INTEGER AllocateStatus - DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE:: RWORK + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE:: RWORK, S + COMPLEX*16, DIMENSION(:), ALLOCATABLE :: E COMPLEX*16, DIMENSION(:,:), ALLOCATABLE:: A, B, WORK * .. * .. External Functions .. @@ -170,15 +170,16 @@ EXTERNAL ALAREQ, ZCHKEQ, ZCHKGB, ZCHKGE, ZCHKGT, ZCHKHE, $ ZCHKHE_ROOK, ZCHKHE_RK, ZCHKHE_AA, ZCHKHP, $ ZCHKLQ, ZCHKUNHR_COL, ZCHKPB, ZCHKPO, ZCHKPS, - $ ZCHKPP, ZCHKPT, ZCHKQ3, ZCHKQL, ZCHKQR, ZCHKRQ, - $ ZCHKSP, ZCHKSY, ZCHKSY_ROOK, ZCHKSY_RK, - $ ZCHKSY_AA, ZCHKTB, ZCHKTP, ZCHKTR, ZCHKTZ, - $ ZDRVGB, ZDRVGE, ZDRVGT, ZDRVHE, ZDRVHE_ROOK, - $ ZDRVHE_RK, ZDRVHE_AA, ZDRVHE_AA_2STAGE, ZDRVHP, - $ ZDRVLS, ZDRVPB, ZDRVPO, ZDRVPP, ZDRVPT, - $ ZDRVSP, ZDRVSY, ZDRVSY_ROOK, ZDRVSY_RK, - $ ZDRVSY_AA, ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT, - $ ZCHKQRTP, ZCHKLQT, ZCHKLQTP, ZCHKTSQR + $ ZCHKPP, ZCHKPT, ZCHKQ3, ZCHKQP3RK, ZCHKQL, + $ ZCHKQR, ZCHKRQ, ZCHKSP, ZCHKSY, ZCHKSY_ROOK, + $ ZCHKSY_RK, ZCHKSY_AA, ZCHKTB, ZCHKTP, ZCHKTR, + $ ZCHKTZ, ZDRVGB, ZDRVGE, ZDRVGT, ZDRVHE, + $ ZDRVHE_ROOK, ZDRVHE_RK, ZDRVHE_AA, + $ ZDRVHE_AA_2STAGE, ZDRVHP, ZDRVLS, ZDRVPB, + $ ZDRVPO, ZDRVPP, ZDRVPT, ZDRVSP, ZDRVSY, + $ ZDRVSY_ROOK, ZDRVSY_RK, ZDRVSY_AA, + $ ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT, ZCHKQRTP, + $ ZCHKLQT, ZCHKLQTP, ZCHKTSQR * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -197,13 +198,18 @@ DATA THREQ / 2.0D0 / , INTSTR / '0123456789' / * * .. Allocate memory dynamically .. - ALLOCATE (RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus) +* + ALLOCATE ( A ( (KDMAX+1) * NMAX, 7 ), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B ( NMAX * MAXRHS, 4 ), STAT = AllocateStatus) + IF (AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( WORK ( NMAX, NMAX+MAXRHS+10 ), STAT = AllocateStatus) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (A ((KDMAX+1) * NMAX, 7), STAT = AllocateStatus) + ALLOCATE ( E( NMAX ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (B (NMAX * MAXRHS, 4), STAT = AllocateStatus) + ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (WORK (NMAX, NMAX+MAXRHS+10), STAT = AllocateStatus) + ALLOCATE ( RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. * .. Executable Statements .. 
@@ -1109,6 +1115,23 @@
          ELSE
             WRITE( NOUT, FMT = 9989 )PATH
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        QK: truncated QR factorization with pivoting
+*
+         NTYPES = 19
+         CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
+*
+         IF( TSTCHK ) THEN
+            CALL ZCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+     $                      NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ),
+     $                      A( 1, 2 ), B( 1, 1 ), B( 1, 2 ),
+     $                      S( 1 ), B( 1, 4 ),
+     $                      WORK, RWORK, IWORK, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 )PATH
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN
 *
@@ -1245,6 +1268,8 @@
 *
       DEALLOCATE (A, STAT = AllocateStatus)
       DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (E, STAT = AllocateStatus)
+      DEALLOCATE (S, STAT = AllocateStatus)
       DEALLOCATE (RWORK, STAT = AllocateStatus)
       DEALLOCATE (WORK, STAT = AllocateStatus)
 *
diff --git a/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f
index 381fac9f2..51082f1d0 100644
--- a/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f
@@ -8,7 +8,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB, NBVAL, 
+*       SUBROUTINE ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB, NBVAL,
 *                                    NNS, NSVAL, THRESH, TSTERR, NMAX, A,
 *                                    AFAC, AINV, B, X, XACT, WORK, RWORK,
 *                                    IWORK, NOUT )
@@ -185,7 +185,8 @@
       LOGICAL            DOTYPE( * )
       INTEGER            IWORK( * ), NBVAL( * ), NSVAL( * ), NVAL( * )
       COMPLEX*16         A( * ), AFAC( * ), AINV( * ), B( * ),
-     $                   RWORK( * ), WORK( * ), X( * ), XACT( * )
+     $                   WORK( * ), X( * ), XACT( * )
+      DOUBLE PRECISION   RWORK( * )
 *     ..
 *
 *  =====================================================================
@@ -430,9 +431,9 @@
 *              block factorization, LWORK is the length of AINV.
 *
                SRNAMT = 'ZHETRF_AA_2STAGE'
-               LWORK = MIN(N*NB, 3*NMAX*NMAX)
-               CALL ZHETRF_AA_2STAGE( UPLO, N, AFAC, LDA,
-     $                                AINV, (3*NB+1)*N,
+               LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
+               CALL ZHETRF_AA_2STAGE( UPLO, N, AFAC, LDA,
+     $                                AINV, MAX( 1, (3*NB+1)*N ),
      $                                IWORK, IWORK( 1+N ),
      $                                WORK, LWORK,
      $                                INFO )
diff --git a/lapack-netlib/TESTING/LIN/zchkqp3rk.f b/lapack-netlib/TESTING/LIN/zchkqp3rk.f
new file mode 100644
index 000000000..302c7b1a8
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/zchkqp3rk.f
@@ -0,0 +1,836 @@
+*> \brief \b ZCHKQP3RK
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+*      $                      NNB, NBVAL, NXVAL, THRESH, A, COPYA,
+*      $                      B, COPYB, S, TAU,
+*      $                      WORK, RWORK, IWORK, NOUT )
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       INTEGER            NM, NN, NNS, NNB, NOUT
+*       DOUBLE PRECISION   THRESH
+*       ..
+*       .. Array Arguments ..
+*       LOGICAL            DOTYPE( * )
+*       INTEGER            IWORK( * ), MVAL( * ), NBVAL( * ),
+*      $                   NSVAL( * ), NVAL( * ), NXVAL( * )
+*       DOUBLE PRECISION   S( * ), RWORK( * )
+*       COMPLEX*16         A( * ), COPYA( * ), B( * ), COPYB( * ),
+*      $                   TAU( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZCHKQP3RK tests ZGEQP3RK.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] DOTYPE
+*> \verbatim
+*>          DOTYPE is LOGICAL array, dimension (NTYPES)
+*>          The matrix types to be used for testing. Matrices of type j
+*>          (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) =
+*>          .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used.
+*> \endverbatim
+*>
+*> \param[in] NM
+*> \verbatim
+*>          NM is INTEGER
+*>          The number of values of M contained in the vector MVAL.
+*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. +*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNS +*> \verbatim +*> NNS is INTEGER +*> The number of values of NRHS contained in the vector NSVAL. +*> \endverbatim +*> +*> \param[in] NSVAL +*> \verbatim +*> NSVAL is INTEGER array, dimension (NNS) +*> The values of the number of right hand sides NRHS. +*> \endverbatim +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB and NX contained in the +*> vectors NBVAL and NXVAL. The blocking parameters are used +*> in pairs (NB,NX). +*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NNB) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NXVAL +*> \verbatim +*> NXVAL is INTEGER array, dimension (NNB) +*> The values of the crossover point NX. +*> \endverbatim +*> +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. +*> \endverbatim +*> +*> \param[out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (MMAX*NMAX) +*> where MMAX is the maximum value of M in MVAL and NMAX is the +*> maximum value of N in NVAL. +*> \endverbatim +*> +*> \param[out] COPYA +*> \verbatim +*> COPYA is COMPLEX*16 array, dimension (MMAX*NMAX) +*> \endverbatim +*> +*> \param[out] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (MMAX*NSMAX) +*> where MMAX is the maximum value of M in MVAL and NSMAX is the +*> maximum value of NRHS in NSVAL. +*> \endverbatim +*> +*> \param[out] COPYB +*> \verbatim +*> COPYB is COMPLEX*16 array, dimension (MMAX*NSMAX) +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is DOUBLE PRECISION array, dimension +*> (min(MMAX,NMAX)) +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX*16 array, dimension (MMAX) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension +*> (max(M*max(M,N) + 4*min(M,N) + max(M,N))) +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (4*NMAX) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (2*NMAX) +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16_lin +* +* ===================================================================== + SUBROUTINE ZCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, + $ B, COPYB, S, TAU, + $ WORK, RWORK, IWORK, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER NM, NN, NNB, NNS, NOUT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. 
+ LOGICAL DOTYPE( * ) + INTEGER IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ), + $ NSVAL( * ), NXVAL( * ) + DOUBLE PRECISION S( * ), RWORK( * ) + COMPLEX*16 A( * ), COPYA( * ), B( * ), COPYB( * ), + $ TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NTYPES + PARAMETER ( NTYPES = 19 ) + INTEGER NTESTS + PARAMETER ( NTESTS = 5 ) + DOUBLE PRECISION ONE, ZERO, BIGNUM + COMPLEX*16 CONE, CZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0, + $ CZERO = ( 0.0D+0, 0.0D+0 ), + $ CONE = ( 1.0D+0, 0.0D+0 ), + $ BIGNUM = 1.0D+38 ) +* .. +* .. Local Scalars .. + CHARACTER DIST, TYPE + CHARACTER*3 PATH + INTEGER I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO, + $ INB, IND_OFFSET_GEN, + $ IND_IN, IND_OUT, INS, INFO, + $ ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO, + $ KFACT, KL, KMAX, KU, LDA, LW, LWORK, + $ LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N, + $ NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS, + $ NRUN, NX, T + DOUBLE PRECISION ANORM, CNDNUM, EPS, ABSTOL, RELTOL, + $ DTEMP, MAXC2NRMK, RELMAXC2NRMK +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ), ISEEDY( 4 ) + DOUBLE PRECISION RESULT( NTESTS ), RDUMMY( 1 ) +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, ZQPT01, ZQRT11, ZQRT12, ZLANGE + EXTERNAL DLAMCH, ZQPT01, ZQRT11, ZQRT12, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL ALAERH, ALAHD, ALASUM, DLAORD, ICOPY, ZAXPY, + $ XLAENV, ZGEQP3RK, ZLACPY, ZLASET, ZLATB4, + $ ZLATMS, ZUNMQR, ZSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX, MIN, MOD +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, IOUNIT, ZUNMQR_LWORK +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, IOUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEEDY / 1988, 1989, 1990, 1991 / +* .. +* .. Executable Statements .. +* +* Initialize constants and the random number seed. +* + PATH( 1: 1 ) = 'Zomplex precision' + PATH( 2: 3 ) = 'QK' + NRUN = 0 + NFAIL = 0 + NERRS = 0 + DO I = 1, 4 + ISEED( I ) = ISEEDY( I ) + END DO + EPS = DLAMCH( 'Epsilon' ) + INFOT = 0 +* + DO IM = 1, NM +* +* Do for each value of M in MVAL. +* + M = MVAL( IM ) + LDA = MAX( 1, M ) +* + DO IN = 1, NN +* +* Do for each value of N in NVAL. +* + N = NVAL( IN ) + MINMN = MIN( M, N ) + LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ), + $ M*N + 2*MINMN + 4*N ) +* + DO INS = 1, NNS + NRHS = NSVAL( INS ) +* +* Set up parameters with ZLATB4 and generate +* M-by-NRHS B matrix with ZLATMS. +* IMAT = 14: +* Random matrix, CNDNUM = 2, NORM = ONE, +* MODE = 3 (geometric distribution of singular values). +* + CALL ZLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'ZLATMS' + CALL ZLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYB, LDA, WORK, INFO ) +* +* Check error code from ZLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'ZLATMS', INFO, 0, ' ', M, + $ NRHS, -1, -1, -1, 6, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + DO IMAT = 1, NTYPES +* +* Do the tests only if DOTYPE( IMAT ) is true. +* + IF( .NOT.DOTYPE( IMAT ) ) + $ CYCLE +* +* The type of distribution used to generate the random +* eigen-/singular values: +* ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 ) +* +* Do for each type of NON-SYMMETRIC matrix: CNDNUM NORM MODE +* 1. Zero matrix +* 2. Random, Diagonal, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 3. 
Random, Upper triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*        4. Random, Lower triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*        5. Random, First column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*        6. Random, Last MINMN column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*        7. Random, Last N column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*        8. Random, Middle column in MINMN is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*        9. Random, First half of MINMN columns are zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*       10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*       11. Random, Half MINMN columns in the middle are zero starting
+*           from MINMN/2-(MINMN/2)/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*       12. Random, Odd columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*       13. Random, Even columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*       14. Random, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values )
+*       15. Random, CNDNUM = sqrt(0.1/EPS) CNDNUM = BADC1 = sqrt(0.1/EPS) ONE 3 ( geometric distribution of singular values )
+*       16. Random, CNDNUM = 0.1/EPS CNDNUM = BADC2 = 0.1/EPS ONE 3 ( geometric distribution of singular values )
+*       17. Random, CNDNUM = 0.1/EPS, CNDNUM = BADC2 = 0.1/EPS ONE 2 ( one small singular value, S(N)=1/CNDNUM )
+*           one small singular value S(N)=1/CNDNUM
+*       18. Random, CNDNUM = 2, scaled near underflow CNDNUM = 2 SMALL = SAFMIN
+*       19. Random, CNDNUM = 2, scaled near overflow CNDNUM = 2 LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) ) 3 ( geometric distribution of singular values )
+*
+               IF( IMAT.EQ.1 ) THEN
+*
+*                 Matrix 1: Zero matrix
+*
+                  CALL ZLASET( 'Full', M, N, CZERO, CZERO, COPYA, LDA )
+                  DO I = 1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 )
+     $                  .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN
+*
+*                 Matrices 2-4 and 14-19.
+*
+*                 Set up parameters with ZLATB4 and generate a test
+*                 matrix with ZLATMS.
+*
+                  CALL ZLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'ZLATMS'
+                  CALL ZLATMS( M, N, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA, LDA, WORK, INFO )
+*
+*                 Check error code from ZLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'ZLATMS', INFO, 0, ' ', M, N,
+     $                            -1, -1, -1, IMAT, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+                  CALL DLAORD( 'Decreasing', MINMN, S, 1 )
+*
+               ELSE IF( MINMN.GE.2
+     $                  .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN
+*
+*                 Rectangular matrices 5-13 that contain zero columns,
+*                 only for matrices with MINMN >= 2.
+*
+*                 JB_ZERO    is the column index of the ZERO block.
+*                 NB_ZERO    is the column block size of the ZERO block.
+*                 NB_GEN     is the column block size of the
+*                            generated block.
+*                 J_INC      is the non-zero column index increment
+*                            for matrices 12 and 13.
+*                 J_FIRST_NZ is the index of the first non-zero
+*                            column.
+*
+                  IF( IMAT.EQ.5 ) THEN
+*
+*                    First column is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.6 ) THEN
+*
+*                    Last column MINMN is zero.
+*
+                     JB_ZERO = MINMN
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.7 ) THEN
+*
+*                    Last column N is zero.
+*
+                     JB_ZERO = N
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.8 ) THEN
+*
+*                    Middle column in MINMN is zero.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.9 ) THEN
+*
+*                    First half of MINMN columns is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.10 ) THEN
+*
+*                    Last columns are zero columns,
+*                    starting from (MINMN / 2 + 1) column.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = N - JB_ZERO + 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.11 ) THEN
+*
+*                    Half of the columns in the middle of MINMN
+*                    columns are zero, starting from
+*                    MINMN/2 - (MINMN/2)/2 + 1 column.
+*
+                     JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.12 ) THEN
+*
+*                    Odd-numbered columns are zero.
+*
+                     NB_GEN = N / 2
+                     NB_ZERO = N - NB_GEN
+                     J_INC = 2
+                     J_FIRST_NZ = 2
+*
+                  ELSE IF( IMAT.EQ.13 ) THEN
+*
+*                    Even-numbered columns are zero.
+*
+                     NB_ZERO = N / 2
+                     NB_GEN = N - NB_ZERO
+                     J_INC = 2
+                     J_FIRST_NZ = 1
+*
+                  END IF
+*
+*
+*                 1) Set the first NB_ZERO columns in COPYA(1:M,1:N)
+*                    to zero.
+*
+                  CALL ZLASET( 'Full', M, NB_ZERO, CZERO, CZERO,
+     $                         COPYA, LDA )
+*
+*                 2) Generate an M-by-(N-NB_ZERO) matrix with the
+*                    chosen singular value distribution
+*                    in COPYA(1:M,NB_ZERO+1:N).
+*
+                  CALL ZLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU,
+     $                         ANORM, MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'ZLATMS'
+*
+                  IND_OFFSET_GEN = NB_ZERO * LDA
+*
+                  CALL ZLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA( IND_OFFSET_GEN + 1 ), LDA,
+     $                         WORK, INFO )
+*
+*                 Check error code from ZLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'ZLATMS', INFO, 0, ' ', M,
+     $                            NB_GEN, -1, -1, -1, IMAT, NFAIL,
+     $                            NERRS, NOUT )
+                     CYCLE
+                  END IF
+*
+*                 3) Swap the generated columns from the right side
+*                    NB_GEN-size block in COPYA into correct column
+*                    positions.
+*
+                  IF( IMAT.EQ.6
+     $                .OR. IMAT.EQ.7
+     $                .OR. IMAT.EQ.8
+     $                .OR. IMAT.EQ.10
+     $                .OR. IMAT.EQ.11 ) THEN
+*
+*                    Move by swapping the generated columns
+*                    from the right NB_GEN-size block from
+*                    (NB_ZERO+1:NB_ZERO+JB_ZERO)
+*                    into columns (1:JB_ZERO-1).
+*
+                     DO J = 1, JB_ZERO-1, 1
+                        CALL ZSWAP( M,
+     $                       COPYA( ( NB_ZERO+J-1)*LDA+1), 1,
+     $                       COPYA( (J-1)*LDA + 1 ), 1 )
+                     END DO
+*
+                  ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN
+*
+*                    ( IMAT = 12, Odd-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the even zero columns in the
+*                    left NB_ZERO-size block.
+*
+*                    ( IMAT = 13, Even-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the odd zero columns in the
+*                    left NB_ZERO-size block.
+*
+                     DO J = 1, NB_GEN, 1
+                        IND_OUT = ( NB_ZERO+J-1 )*LDA + 1
+                        IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA
+     $                           + 1
+                        CALL ZSWAP( M,
+     $                       COPYA( IND_OUT ), 1,
+     $                       COPYA( IND_IN), 1 )
+                     END DO
+*
+                  END IF
+*
+*                 4) Order the singular values generated by
+*                    ZLATMS in decreasing order and add trailing zeros
+*                    that correspond to zero columns.
+*                    The total number of singular values is MINMN.
+*
+                  MINMNB_GEN = MIN( M, NB_GEN )
+*
+                  CALL DLAORD( 'Decreasing', MINMNB_GEN, S, 1 )
+
+                  DO I = MINMNB_GEN+1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE
+*
+*                 IF(MINMN.LT.2) skip this size for this matrix type.
+*
+                  CYCLE
+               END IF
+*
+*              Initialize a copy array for a pivot array for ZGEQP3RK.
+*
+               DO I = 1, N
+                  IWORK( I ) = 0
+               END DO
+*
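The zero-column types 5-13 above all follow one pattern: NB_ZERO columns of COPYA are zeroed, the remaining NB_GEN columns are filled by ZLATMS and swapped into place, and the reference vector S is the generated block's singular values padded with trailing zeros. Here is a minimal sketch of the type-13 layout (even-numbered columns zero), written in Python; NumPy is an assumption here and the sketch is not part of the patch:

    # Illustrative only: layout of matrix type 13 (even-numbered columns zero).
    import numpy as np

    m, n = 8, 6
    nb_gen = n - n // 2                        # number of generated (non-zero) columns
    rng = np.random.default_rng(1988)          # seed echoes ISEEDY above

    a = np.zeros((m, n))
    a[:, 0::2] = rng.standard_normal((m, nb_gen))   # odd 1-based columns are non-zero

    # The singular values of A are those of the generated block, padded with
    # trailing zeros -- exactly what the driver stores in S.
    s_block = np.linalg.svd(a[:, 0::2], compute_uv=False)
    s = np.concatenate([s_block, np.zeros(min(m, n) - s_block.size)])
    assert np.allclose(np.linalg.svd(a, compute_uv=False), s)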
+               DO INB = 1, NNB
+*
+*                 Do for each pair of values (NB,NX) in NBVAL and NXVAL.
+*
+                  NB = NBVAL( INB )
+                  CALL XLAENV( 1, NB )
+                  NX = NXVAL( INB )
+                  CALL XLAENV( 3, NX )
+*
+*                 We loop over KMAX = 0, ..., MIN(M,N)+1 so that the
+*                 case KMAX > MIN(M,N) is also tested; in that case
+*                 the routine has to reduce KMAX to MIN(M,N).
+*
+                  DO KMAX = 0, MIN(M,N)+1
+*
+*                    Get a working copy of COPYA into A( 1:M, 1:N ).
+*                    Get a working copy of COPYB into A( 1:M, (N+1):(N+NRHS) ).
+*                    Get a working copy of COPYB into B( 1:M, 1:NRHS ).
+*                    Get a working copy of IWORK(1:N), filled with zeros,
+*                    into IWORK( N+1:2N ), which is going to be used as
+*                    the pivot array.
+*                    NOTE: IWORK(2N+1:3N) is going to be used as a WORK
+*                    array for the routine.
+*
+                     CALL ZLACPY( 'All', M, N, COPYA, LDA, A, LDA )
+                     CALL ZLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                            A( LDA*N + 1 ), LDA )
+                     CALL ZLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                            B, LDA )
+                     CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 )
+*
+                     ABSTOL = -1.0
+                     RELTOL = -1.0
+*
+*                    Compute the QR factorization with pivoting of A
+*
+                     LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ),
+     $                                 3*N + NRHS - 1 ) )
+*
+*                    Compute ZGEQP3RK factorization of A.
+*
+                     SRNAMT = 'ZGEQP3RK'
+                     CALL ZGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                              A, LDA, KFACT, MAXC2NRMK,
+     $                              RELMAXC2NRMK, IWORK( N+1 ), TAU,
+     $                              WORK, LW, RWORK, IWORK( 2*N+1 ),
+     $                              INFO )
+*
+*                    Check error code from ZGEQP3RK.
+*
+                     IF( INFO.LT.0 )
+     $                  CALL ALAERH( PATH, 'ZGEQP3RK', INFO, 0, ' ',
+     $                               M, N, NX, -1, NB, IMAT,
+     $                               NFAIL, NERRS, NOUT )
+*
+                     IF( KFACT.EQ.MINMN ) THEN
+*
+*                       Compute test 1:
+*
+*                       This test is only for the full rank
+*                       factorization of the matrix A.
+*
+*                       Array S(1:min(M,N)) contains svd(A), the singular
+*                       values of the original matrix A in decreasing
+*                       absolute value order. The test computes svd(R),
+*                       the vector of singular values of the upper
+*                       trapezoid of A(1:M,1:N) that contains the
+*                       factor R, in decreasing order. The test returns
+*                       the ratio:
+*
+*                       2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS )
+*
+                        RESULT( 1 ) = ZQRT12( M, N, A, LDA, S, WORK,
+     $                                        LWORK, RWORK )
+*
+                        DO T = 1, 1
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', M, N,
+     $                            NRHS, KMAX, ABSTOL, RELTOL, NB, NX,
+     $                            IMAT, T, RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                       End test 1
+*
+                     END IF
+
+*                    Compute test 2:
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS )
+*
+                     RESULT( 2 ) = ZQPT01( M, N, KFACT, COPYA, A, LDA, TAU,
+     $                             IWORK( N+1 ), WORK, LWORK )
+*
+*                    Compute test 3:
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( Q**H * Q - I ) / ( M * EPS )
+*
+                     RESULT( 3 ) = ZQRT11( M, KFACT, A, LDA, TAU, WORK,
+     $                                     LWORK )
+*
+*                    Print information about the tests that did not pass
+*                    the threshold.
+*
+                     DO T = 2, 3
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', M, N,
+     $                         NRHS, KMAX, ABSTOL, RELTOL,
+     $                         NB, NX, IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 2
+*
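The ratios computed by tests 1-3 above have direct dense-linear-algebra analogues that fit in a few lines. In this sketch, SciPy's pivoted QR stands in for ZGEQP3RK (the real routine additionally truncates via KMAX/ABSTOL/RELTOL); NumPy and SciPy availability is an assumption, and the snippet is not part of the patch:

    # Illustrative only: the residual ratios behind tests 1-3.
    import numpy as np
    from scipy.linalg import qr

    rng = np.random.default_rng(0)
    m, n = 10, 6
    a = rng.standard_normal((m, n)) + 1j * rng.standard_normal((m, n))
    eps = np.finfo(np.float64).eps

    q, r, jpvt = qr(a, pivoting=True)          # A[:, jpvt] = Q @ R

    # Test 1 analogue: singular values of R match those of A.
    sv_a = np.linalg.svd(a, compute_uv=False)
    sv_r = np.linalg.svd(r, compute_uv=False)
    ratio1 = np.linalg.norm(sv_r - sv_a) / (max(m, n) * np.linalg.norm(sv_a) * eps)

    # Test 2 analogue: factorization residual ||A*P - Q*R||.
    ratio2 = np.linalg.norm(a[:, jpvt] - q @ r, 1) / (max(m, n) * np.linalg.norm(a, 1) * eps)

    # Test 3 analogue: orthonormality of Q, ||Q**H * Q - I||.
    ratio3 = np.linalg.norm(q.conj().T @ q - np.eye(m), 1) / (m * eps)

    print(ratio1, ratio2, ratio3)   # all O(1), i.e. well below THRESH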
+*                    Compute test 4:
+*
+*                    This test is only for factorizations with rank
+*                    not less than 2. The elements on the diagonal
+*                    of R should be non-increasing.
+*
+*                    The test returns the ratio 1.0D+100 if
+*                    abs(R(K+1,K+1)) > abs(R(K,K)) for some
+*                    K = 1:KFACT-1, and zero otherwise.
+*
+                     IF( MIN(KFACT, MINMN).GE.2 ) THEN
+*
+*                       RESULT( 4 ) is initialized here, as it is not
+*                       assigned in the loop below when the diagonal
+*                       is non-increasing.
+*
+                        RESULT( 4 ) = ZERO
+*
+                        DO J = 1, KFACT-1, 1
+*
+                           DTEMP = (( ABS( A( (J-1)*M+J ) ) -
+     $                              ABS( A( (J)*M+J+1 ) ) ) /
+     $                              ABS( A(1) ) )
+*
+                           IF( DTEMP.LT.ZERO ) THEN
+                              RESULT( 4 ) = BIGNUM
+                           END IF
+*
+                        END DO
+*
+*                       Print information about the tests that did not
+*                       pass the threshold.
+*
+                        DO T = 4, 4
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK',
+     $                            M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T,
+     $                            RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                       End test 4.
+*
+                     END IF
+*
+*                    Compute test 5:
+*
+*                    This test is only for matrix A with min(M,N) > 0.
+*
+*                    The test returns the ratio:
+*
+*                    1-norm( Q**H * B computed by ZUNMQR
+*                            - Q**H * B computed by ZGEQP3RK ) /
+*                       ( M * EPS )
+*
+*                    (1) Compute B:=Q**H * B in the matrix B.
+*
+                     IF( MINMN.GT.0 ) THEN
+*
+                        LWORK_MQR = MAX(1, NRHS)
+                        CALL ZUNMQR( 'Left', 'Conjugate transpose',
+     $                               M, NRHS, KFACT, A, LDA, TAU, B, LDA,
+     $                               WORK, LWORK_MQR, INFO )
+*
+                        DO I = 1, NRHS
+*
+*                          Compare the (N+I)-th column of A and the
+*                          I-th column of B.
+*
+                           CALL ZAXPY( M, -CONE, A( ( N+I-1 )*LDA+1 ), 1,
+     $                                 B( ( I-1 )*LDA+1 ), 1 )
+                        END DO
+*
+                        RESULT( 5 ) =
+     $                     ABS(
+     $                     ZLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) /
+     $                     ( DBLE( M )*DLAMCH( 'Epsilon' ) )
+     $                     )
+*
+*                       Print information about the tests that did not pass
+*                       the threshold.
+*
+                        DO T = 5, 5
+                           IF( RESULT( T ).GE.THRESH ) THEN
+                              IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                           CALL ALAHD( NOUT, PATH )
+                              WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', M, N,
+     $                            NRHS, KMAX, ABSTOL, RELTOL,
+     $                            NB, NX, IMAT, T, RESULT( T )
+                              NFAIL = NFAIL + 1
+                           END IF
+                        END DO
+                        NRUN = NRUN + 1
+*
+*                       End compute test 5.
+*
+                     END IF
+*
+*                 END DO for KMAX = 0, MIN(M,N)+1
+*
+                  END DO
+*
+*              END DO for INB = 1, NNB
+*
+               END DO
+*
+*           END DO for IMAT = 1, NTYPES
+*
+            END DO
+*
+*        END DO for INS = 1, NNS
+*
+         END DO
+*
+*     END DO for IN = 1, NN
+*
+      END DO
+*
+*  END DO for IM = 1, NM
+*
+      END DO
+*
+*     Print a summary of the results.
+*
+      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
+*
+ 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5,
+     $      ', KMAX =', I5, ', ABSTOL =', G12.5,
+     $      ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4,
+     $      ', type ', I2, ', test ', I2, ', ratio =', G12.5 )
+*
+*     End of ZCHKQP3RK
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f
index 9401867e0..fcd774491 100644
--- a/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f
@@ -400,9 +400,9 @@
 *           Factor the matrix and solve the system using ZHESV_AA.
 *
             SRNAMT = 'ZHESV_AA_2STAGE '
-            LWORK = MIN(N*NB, 3*NMAX*NMAX)
+            LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
             CALL ZHESV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
-     $                            AINV, (3*NB+1)*N,
+     $                            AINV, MAX( 1, (3*NB+1)*N ),
     $                            IWORK, IWORK( 1+N ),
     $                            X, LDA, WORK, LWORK, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/zlatb4.f b/lapack-netlib/TESTING/LIN/zlatb4.f
index a6977f4e9..a2b19f83d 100644
--- a/lapack-netlib/TESTING/LIN/zlatb4.f
+++ b/lapack-netlib/TESTING/LIN/zlatb4.f
@@ -154,9 +154,6 @@
 *     .. Intrinsic Functions ..
       INTRINSIC ABS, MAX, SQRT
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL DLABAD
-*     ..
 *     .. Save statement ..
       SAVE EPS, SMALL, LARGE, BADC1, BADC2, FIRST
 *     ..
@@ -174,11 +171,6 @@
          BADC1 = SQRT( BADC2 )
          SMALL = DLAMCH( 'Safe minimum' )
          LARGE = ONE / SMALL
-*
-*        If it looks like we're on a Cray, take the square root of
-*        SMALL and LARGE to avoid overflow and underflow problems.
-*
-         CALL DLABAD( SMALL, LARGE )
          SMALL = SHRINK*( SMALL / EPS )
          LARGE = ONE / SMALL
       END IF
@@ -233,6 +225,110 @@
       ELSE
          ANORM = ONE
       END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        xQK: truncated QR with pivoting.
+*             Set parameters to generate a general
+*             M x N matrix.
+*
+*        Set TYPE, the type of matrix to be generated. 'N' is nonsymmetric.
+*
+         TYPE = 'N'
+*
+*        Set DIST, the type of distribution for the random
+*        number generator. 'S' is symmetric (values uniformly
+*        distributed on (-1,1)).
+*
+         DIST = 'S'
+*
+*        Set the lower and upper bandwidths.
+*
+         IF( IMAT.EQ.2 ) THEN
+*
+*           2. Random, Diagonal, CNDNUM = 2
+*
+            KL = 0
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.3 ) THEN
+*
+*           3. Random, Upper triangular, CNDNUM = 2
+*
+            KL = 0
+            KU = MAX( N-1, 0 )
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.4 ) THEN
+*
+*           4. Random, Lower triangular, CNDNUM = 2
+*
+            KL = MAX( M-1, 0 )
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE
+*
+*           5.-19. Rectangular matrix
+*
+            KL = MAX( M-1, 0 )
+            KU = MAX( N-1, 0 )
+*
+            IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN
+*
+*              5.-14. Random, CNDNUM = 2.
+*
+               CNDNUM = TWO
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.15 ) THEN
+*
+*              15. Random, CNDNUM = sqrt(0.1/EPS)
+*
+               CNDNUM = BADC1
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.16 ) THEN
+*
+*              16. Random, CNDNUM = 0.1/EPS
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.17 ) THEN
+*
+*              17. Random, CNDNUM = 0.1/EPS,
+*                  one small singular value S(N)=1/CNDNUM
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 2
+*
+            ELSE IF( IMAT.EQ.18 ) THEN
+*
+*              18. Random, scaled near underflow
+*
+               CNDNUM = TWO
+               ANORM = SMALL
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.19 ) THEN
+*
+*              19. Random, scaled near overflow
+*
+               CNDNUM = TWO
+               ANORM = LARGE
+               MODE = 3
+*
+            END IF
+*
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN
 *
@@ -517,17 +613,18 @@
 *
 *        Set the norm and condition number.
 *
-      IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN
+      MAT = ABS( IMAT )
+      IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN
          CNDNUM = BADC1
-      ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN
+      ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN
          CNDNUM = BADC2
       ELSE
          CNDNUM = TWO
       END IF
 *
-      IF( IMAT.EQ.4 ) THEN
+      IF( MAT.EQ.4 ) THEN
          ANORM = SMALL
-      ELSE IF( IMAT.EQ.5 ) THEN
+      ELSE IF( MAT.EQ.5 ) THEN
          ANORM = LARGE
       ELSE
          ANORM = ONE
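The 'QK' branch added to zlatb4 above is essentially a lookup table from IMAT to generator parameters: bandwidths (KL, KU), condition number, norm, and singular-value mode. For reference, a compact Python rendering of that mapping; this is purely illustrative, the function name is ad hoc, and SMALL/LARGE are left as string placeholders for the machine-dependent scalars zlatb4 actually computes:

    # Illustrative only: parameter table encoded by the new 'QK' branch.
    import math

    def qk_params(imat, m, n, eps=2.0**-52):
        badc2 = 0.1 / eps                      # BADC2 = 0.1/EPS
        badc1 = math.sqrt(badc2)               # BADC1 = sqrt(0.1/EPS)
        kl, ku = max(m - 1, 0), max(n - 1, 0)  # full bandwidths (rectangular)
        if imat == 2:
            kl = ku = 0                        # diagonal
        elif imat == 3:
            kl = 0                             # upper triangular
        elif imat == 4:
            ku = 0                             # lower triangular
        cndnum, anorm, mode = 2.0, 1.0, 3
        if imat == 15:
            cndnum = badc1
        elif imat in (16, 17):
            cndnum = badc2
            if imat == 17:
                mode = 2                       # one small singular value
        elif imat == 18:
            anorm = "SMALL"                    # placeholder: scaled near underflow
        elif imat == 19:
            anorm = "LARGE"                    # placeholder: scaled near overflow
        return kl, ku, cndnum, anorm, mode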
diff --git a/lapack-netlib/TESTING/LIN/zqpt01.f b/lapack-netlib/TESTING/LIN/zqpt01.f
index 4e53f92c8..c69eb658f 100644
--- a/lapack-netlib/TESTING/LIN/zqpt01.f
+++ b/lapack-netlib/TESTING/LIN/zqpt01.f
@@ -33,7 +33,7 @@
 *> Householder vectors, and the rest of AF contains a partially updated
 *> matrix.
 *>
-*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M)
+*> This function returns ||A*P - Q*R|| / ( ||A||*eps*max(M,N) )
 *> \endverbatim
 *
 * Arguments:
@@ -172,28 +172,28 @@
 *
       NORMA = ZLANGE( 'One-norm', M, N, A, LDA, RWORK )
 *
-      DO 30 J = 1, K
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, K
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = AF( I, J )
-   10    CONTINUE
-         DO 20 I = J + 1, M
+         END DO
+         DO I = J + 1, M
             WORK( ( J-1 )*M+I ) = ZERO
-   20    CONTINUE
-   30 CONTINUE
-      DO 40 J = K + 1, N
+         END DO
+      END DO
+      DO J = K + 1, N
          CALL ZCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 )
-   40 CONTINUE
+      END DO
 *
       CALL ZUNMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK,
     $             M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
-      DO 50 J = 1, N
+      DO J = 1, N
 *
 *        Compare i-th column of QR and jpvt(i)-th column of A
 *
         CALL ZAXPY( M, DCMPLX( -ONE ), A( 1, JPVT( J ) ), 1,
     $               WORK( ( J-1 )*M+1 ), 1 )
-   50 CONTINUE
+      END DO
 *
       ZQPT01 = ZLANGE( 'One-norm', M, N, WORK, M, RWORK ) /
     $         ( DBLE( MAX( M, N ) )*DLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/zqrt11.f b/lapack-netlib/TESTING/LIN/zqrt11.f
index c3be59c36..dc4af744f 100644
--- a/lapack-netlib/TESTING/LIN/zqrt11.f
+++ b/lapack-netlib/TESTING/LIN/zqrt11.f
@@ -158,9 +158,9 @@
       CALL ZUNM2R( 'Left', 'Conjugate transpose', M, M, K, A, LDA, TAU,
     $             WORK, M, WORK( M*M+1 ), INFO )
 *
-      DO 10 J = 1, M
+      DO J = 1, M
          WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE
-   10 CONTINUE
+      END DO
 *
       ZQRT11 = ZLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) /
     $         ( DBLE( M )*DLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/zqrt12.f b/lapack-netlib/TESTING/LIN/zqrt12.f
index 0da6be157..91477b5ea 100644
--- a/lapack-netlib/TESTING/LIN/zqrt12.f
+++ b/lapack-netlib/TESTING/LIN/zqrt12.f
@@ -28,7 +28,7 @@
 *> ZQRT12 computes the singular values `svlues' of the upper trapezoid
 *> of A(1:M,1:N) and returns the ratio
 *>
-*> || s - svlues||/(||svlues||*eps*max(M,N))
+*> ||svlues - s|| / ( ||s||*eps*max(M,N) )
 *> \endverbatim
 *
 * Arguments:
@@ -125,8 +125,8 @@
       EXTERNAL DASUM, DLAMCH, DNRM2, ZLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL DAXPY, DBDSQR, DLABAD, DLASCL, XERBLA, ZGEBD2,
-     $         ZLASCL, ZLASET
+      EXTERNAL DAXPY, DBDSQR, DLASCL, XERBLA, ZGEBD2, ZLASCL,
+     $         ZLASET
 *     ..
 *     .. Intrinsic Functions ..
INTRINSIC DBLE, DCMPLX, MAX, MIN @@ -154,17 +154,16 @@ * CALL ZLASET( 'Full', M, N, DCMPLX( ZERO ), DCMPLX( ZERO ), WORK, $ M ) - DO 20 J = 1, N - DO 10 I = 1, MIN( J, M ) + DO J = 1, N + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = A( I, J ) - 10 CONTINUE - 20 CONTINUE + END DO + END DO * * Get machine parameters * SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale work if max entry outside range [SMLNUM,BIGNUM] * @@ -208,9 +207,9 @@ * ELSE * - DO 30 I = 1, MN + DO I = 1, MN RWORK( I ) = ZERO - 30 CONTINUE + END DO END IF * * Compare s and singular values of work @@ -218,6 +217,7 @@ CALL DAXPY( MN, -ONE, S, 1, RWORK( 1 ), 1 ) ZQRT12 = DASUM( MN, RWORK( 1 ), 1 ) / $ ( DLAMCH( 'Epsilon' )*DBLE( MAX( M, N ) ) ) +* IF( NRMSVL.NE.ZERO ) $ ZQRT12 = ZQRT12 / NRMSVL * diff --git a/lapack-netlib/TESTING/Makefile b/lapack-netlib/TESTING/Makefile index bdea2bfaa..3963260ac 100644 --- a/lapack-netlib/TESTING/Makefile +++ b/lapack-netlib/TESTING/Makefile @@ -61,6 +61,8 @@ SEIGTST= snep.out \ scsd.out \ slse.out +SDMDEIGTST= sdmd.out + CEIGTST= cnep.out \ csep.out \ cse2.out \ @@ -82,6 +84,8 @@ CEIGTST= cnep.out \ ccsd.out \ clse.out +CDMDEIGTST= cdmd.out + DEIGTST= dnep.out \ dsep.out \ dse2.out \ @@ -103,6 +107,8 @@ DEIGTST= dnep.out \ dcsd.out \ dlse.out +DDMDEIGTST= ddmd.out + ZEIGTST= znep.out \ zsep.out \ zse2.out \ @@ -124,6 +130,7 @@ ZEIGTST= znep.out \ zcsd.out \ zlse.out +ZDMDEIGTST= zdmd.out SLINTST= stest.out @@ -142,10 +149,10 @@ ZLINTST= ztest.out ZLINTSTPROTO= zctest.out ztest_rfp.out .PHONY: single complex double complex16 -single: $(SLINTST) $(SEIGTST) -complex: $(CLINTST) $(CEIGTST) -double: $(DLINTST) $(DEIGTST) -complex16: $(ZLINTST) $(ZEIGTST) +single: $(SLINTST) $(SEIGTST) $(SDMDEIGTST) +complex: $(CLINTST) $(CEIGTST) $(CDMDEIGTST) +double: $(DLINTST) $(DEIGTST) $(DDMDEIGTST) +complex16: $(ZLINTST) $(ZEIGTST) $(ZDMDEIGTST) .PHONY: singleproto complexproto doubleproto complex16proto singleproto: $(SLINTSTPROTO) @@ -297,6 +304,10 @@ scsd.out: csd.in EIG/xeigtsts slse.out: lse.in EIG/xeigtsts @echo LSE: Testing Constrained Linear Least Squares routines ./EIG/xeigtsts < lse.in > $@ 2>&1 + +sdmd.out: sdmd.in EIG/xdmdeigtsts + @echo DMD: Testing Dynamic Mode Decomposition routines + ./EIG/xdmdeigtsts < sdmd.in > $@ 2>&1 # # ======== COMPLEX EIG TESTS =========================== @@ -379,6 +390,10 @@ ccsd.out: csd.in EIG/xeigtstc clse.out: lse.in EIG/xeigtstc @echo LSE: Testing Constrained Linear Least Squares routines ./EIG/xeigtstc < lse.in > $@ 2>&1 + +cdmd.out: cdmd.in EIG/xdmdeigtstc + @echo DMD: Testing Dynamic Mode Decomposition routines + ./EIG/xdmdeigtstc < cdmd.in > $@ 2>&1 # # ======== DOUBLE EIG TESTS =========================== @@ -461,6 +476,10 @@ dcsd.out: csd.in EIG/xeigtstd dlse.out: lse.in EIG/xeigtstd @echo LSE: Testing Constrained Linear Least Squares routines ./EIG/xeigtstd < lse.in > $@ 2>&1 + +ddmd.out: ddmd.in EIG/xdmdeigtstd + @echo DMD: Testing Dynamic Mode Decomposition routines + ./EIG/xdmdeigtstd < ddmd.in > $@ 2>&1 # # ======== COMPLEX16 EIG TESTS =========================== @@ -543,6 +562,10 @@ zcsd.out: csd.in EIG/xeigtstz zlse.out: lse.in EIG/xeigtstz @echo LSE: Testing Constrained Linear Least Squares routines ./EIG/xeigtstz < lse.in > $@ 2>&1 + +zdmd.out: zdmd.in EIG/xdmdeigtstz + @echo DMD: Testing Dynamic Mode Decomposition routines + ./EIG/xdmdeigtstz < zdmd.in > $@ 2>&1 # ============================================================================== LIN/xlintsts: $(FRCLIN) $(FRC) @@ -578,15 +601,27 
@@ LIN/xlintstzc: $(FRCLIN) $(FRC)
 
 EIG/xeigtsts: $(FRCEIG) $(FRC)
 	$(MAKE) -C EIG xeigtsts
 
+EIG/xdmdeigtsts: $(FRCEIG) $(FRC)
+	$(MAKE) -C EIG xdmdeigtsts
+
 EIG/xeigtstc: $(FRCEIG) $(FRC)
 	$(MAKE) -C EIG xeigtstc
 
+EIG/xdmdeigtstc: $(FRCEIG) $(FRC)
+	$(MAKE) -C EIG xdmdeigtstc
+
 EIG/xeigtstd: $(FRCEIG) $(FRC)
 	$(MAKE) -C EIG xeigtstd
 
+EIG/xdmdeigtstd: $(FRCEIG) $(FRC)
+	$(MAKE) -C EIG xdmdeigtstd
+
 EIG/xeigtstz: $(FRCEIG) $(FRC)
 	$(MAKE) -C EIG xeigtstz
 
+EIG/xdmdeigtstz: $(FRCEIG) $(FRC)
+	$(MAKE) -C EIG xdmdeigtstz
+
 .PHONY: clean cleantest
 clean: cleantest
 cleantest:
diff --git a/lapack-netlib/TESTING/cdmd.in b/lapack-netlib/TESTING/cdmd.in
new file mode 100644
index 000000000..42d046e01
--- /dev/null
+++ b/lapack-netlib/TESTING/cdmd.in
@@ -0,0 +1,11 @@
+10
+5
+
+20
+10
+
+30
+11
+
+50
+20
diff --git a/lapack-netlib/TESTING/ctest.in b/lapack-netlib/TESTING/ctest.in
index a3588b4a1..74ff31ab8 100644
--- a/lapack-netlib/TESTING/ctest.in
+++ b/lapack-netlib/TESTING/ctest.in
@@ -42,6 +42,7 @@ CRQ 8 List types on next line if 0 < NTYPES < 8
 CLQ 8 List types on next line if 0 < NTYPES < 8
 CQL 8 List types on next line if 0 < NTYPES < 8
 CQP 6 List types on next line if 0 < NTYPES < 6
+CQK 19 List types on next line if 0 < NTYPES < 19
 CTZ 3 List types on next line if 0 < NTYPES < 3
 CLS 6 List types on next line if 0 < NTYPES < 6
 CEQ
diff --git a/lapack-netlib/TESTING/ddmd.in b/lapack-netlib/TESTING/ddmd.in
new file mode 100644
index 000000000..42d046e01
--- /dev/null
+++ b/lapack-netlib/TESTING/ddmd.in
@@ -0,0 +1,11 @@
+10
+5
+
+20
+10
+
+30
+11
+
+50
+20
diff --git a/lapack-netlib/TESTING/dtest.in b/lapack-netlib/TESTING/dtest.in
index 29bb8b92e..1b6c7bd4a 100644
--- a/lapack-netlib/TESTING/dtest.in
+++ b/lapack-netlib/TESTING/dtest.in
@@ -36,6 +36,7 @@ DRQ 8 List types on next line if 0 < NTYPES < 8
 DLQ 8 List types on next line if 0 < NTYPES < 8
 DQL 8 List types on next line if 0 < NTYPES < 8
 DQP 6 List types on next line if 0 < NTYPES < 6
+DQK 19 List types on next line if 0 < NTYPES < 19
 DTZ 3 List types on next line if 0 < NTYPES < 3
 DLS 6 List types on next line if 0 < NTYPES < 6
 DEQ
diff --git a/lapack-netlib/TESTING/sdmd.in b/lapack-netlib/TESTING/sdmd.in
new file mode 100644
index 000000000..42d046e01
--- /dev/null
+++ b/lapack-netlib/TESTING/sdmd.in
@@ -0,0 +1,11 @@
+10
+5
+
+20
+10
+
+30
+11
+
+50
+20
diff --git a/lapack-netlib/TESTING/stest.in b/lapack-netlib/TESTING/stest.in
index 27ac30040..7faa8b7a1 100644
--- a/lapack-netlib/TESTING/stest.in
+++ b/lapack-netlib/TESTING/stest.in
@@ -36,6 +36,7 @@ SRQ 8 List types on next line if 0 < NTYPES < 8
 SLQ 8 List types on next line if 0 < NTYPES < 8
 SQL 8 List types on next line if 0 < NTYPES < 8
 SQP 6 List types on next line if 0 < NTYPES < 6
+SQK 19 List types on next line if 0 < NTYPES < 19
 STZ 3 List types on next line if 0 < NTYPES < 3
 SLS 6 List types on next line if 0 < NTYPES < 6
 SEQ
diff --git a/lapack-netlib/TESTING/zdmd.in b/lapack-netlib/TESTING/zdmd.in
new file mode 100644
index 000000000..42d046e01
--- /dev/null
+++ b/lapack-netlib/TESTING/zdmd.in
@@ -0,0 +1,11 @@
+10
+5
+
+20
+10
+
+30
+11
+
+50
+20
diff --git a/lapack-netlib/TESTING/ztest.in b/lapack-netlib/TESTING/ztest.in
index 58da33d60..c83e82e45 100644
--- a/lapack-netlib/TESTING/ztest.in
+++ b/lapack-netlib/TESTING/ztest.in
@@ -42,6 +42,7 @@ ZRQ 8 List types on next line if 0 < NTYPES < 8
 ZLQ 8 List types on next line if 0 < NTYPES < 8
 ZQL 8 List types on next line if 0 < NTYPES < 8
 ZQP 6 List types on next line if 0 < NTYPES < 6
+ZQK 19 List types on next line if 0 <
NTYPES < 19 ZTZ 3 List types on next line if 0 < NTYPES < 3 ZLS 6 List types on next line if 0 < NTYPES < 6 ZEQ diff --git a/lapack-netlib/lapack_testing.py b/lapack-netlib/lapack_testing.py index 5582744a0..fea0ff765 100755 --- a/lapack-netlib/lapack_testing.py +++ b/lapack-netlib/lapack_testing.py @@ -1,31 +1,29 @@ -#! /usr/bin/env python -# -*- coding: utf-8 -*- +#!/usr/bin/env python3 ############################################################################### # lapack_testing.py ############################################################################### -from __future__ import print_function from subprocess import Popen, STDOUT, PIPE import os, sys, math import getopt # Arguments try: opts, args = getopt.getopt(sys.argv[1:], "hd:b:srep:t:n", - ["help", "dir", "bin", "short", "run", "error","prec=","test=","number"]) + ["help", "dir=", "bin=", "short", "run", "error","prec=","test=","number"]) except getopt.error as msg: print(msg) print("for help use --help") sys.exit(2) -short_summary=0 -with_file=1 -just_errors = 0 +short_summary = False +with_file = True +just_errors = False prec='x' test='all' -only_numbers=0 +only_numbers = False test_dir='TESTING' bin_dir='bin/Release' @@ -34,10 +32,9 @@ for o, a in opts: print(sys.argv[0]+" [-h|--help] [-d dir |--dir dir] [-s |--short] [-r |--run] [-e |--error] [-p p |--prec p] [-t test |--test test] [-n | --number]") print(" - h is to print this message") print(" - r is to use to run the LAPACK tests then analyse the output (.out files). By default, the script will not run all the LAPACK tests") - print(" - d [dir] is to indicate where is the LAPACK testing directory (.out files). By default, the script will use .") - print(" - b [bin] is to indicate where is the LAPACK binary files are located. By default, the script will use .") + print(" - d [dir] indicates the location of the LAPACK testing directory (.out files). By default, the script will use {:s}.".format(test_dir)) + print(" - b [bin] indicates the location of the LAPACK binary files. 
By default, the script will use {:s}.".format(bin_dir)) print(" LEVEL OF OUTPUT") - print(" - x is to print a detailed summary") print(" - e is to print only the error summary") print(" - s is to print a short summary") print(" - n is to print the numbers of failing tests (turn on summary mode)") @@ -63,15 +60,14 @@ for o, a in opts: print(" Will return the numbers of failed tests in REAL precision by running the LAPACK Tests then analyzing the output") print(" ./lapack_testing.py -n -p s -t eig ") print(" Will return the numbers of failed tests in REAL precision by analyzing only the LAPACK output of EIGEN testings") - print("Written by Julie Langou (June 2011) ") sys.exit(0) else: if o in ("-s", "--short"): - short_summary = 1 + short_summary = True if o in ("-r", "--run"): - with_file = 0 + with_file = False if o in ("-e", "--error"): - just_errors = 1 + just_errors = True if o in ( '-p', '--prec' ): prec = a if o in ( '-b', '--bin' ): @@ -81,12 +77,12 @@ for o, a in opts: if o in ( '-t', '--test' ): test = a if o in ( '-n', '--number' ): - only_numbers = 1 - short_summary = 1 + only_numbers = True + short_summary = True # process options -abs_bin_dir=os.path.normpath(os.path.join(os.getcwd(),bin_dir)) +abs_bin_dir=os.path.abspath(bin_dir) os.chdir(test_dir) @@ -108,7 +104,7 @@ def run_summary_test( f, cmdline, short_summary): nb_test_illegal=0 nb_test_info=0 - if (with_file): + if with_file: if not os.path.exists(cmdline): error_message=cmdline+" file not found" r=1 @@ -140,21 +136,21 @@ def run_summary_test( f, cmdline, short_summary): for line in pipe.readlines(): f.write(str(line)) words_in_line=line.split() - if (line.find("run")!=-1): + if (line.find("run)")!=-1): # print line whereisrun=words_in_line.index("run)") nb_test_run+=int(words_in_line[whereisrun-2]) if (line.find("out of")!=-1): - if (short_summary==0): print(line, end=' ') + if not short_summary: print(line, end=' ') whereisout= words_in_line.index("out") nb_test_fail+=int(words_in_line[whereisout-1]) if ((line.find("illegal")!=-1) or (line.find("Illegal")!=-1)): - if (short_summary==0):print(line, end=' ') + if not short_summary: print(line, end=' ') nb_test_illegal+=1 if (line.find(" INFO")!=-1): - if (short_summary==0):print(line, end=' ') + if not short_summary: print(line, end=' ') nb_test_info+=1 - if (with_file==1): + if with_file: pipe.close() f.flush(); @@ -169,7 +165,7 @@ try: except IOError: f = sys.stdout -if (short_summary==0): +if not short_summary: print(" ") print("---------------- Testing LAPACK Routines ----------------") print(" ") @@ -203,6 +199,8 @@ elif test=='mixed': range_prec=[1,3] elif test=='rfp': range_test=[18] +elif test=='dmd': + range_test=[20] elif test=='eig': range_test=list(range(16)) else: @@ -219,7 +217,7 @@ for dtype in range_prec: letter = dtypes[0][dtype] name = dtypes[1][dtype] - if (short_summary==0): + if not short_summary: print(" ") print("------------------------- %s ------------------------" % name) print(" ") @@ -231,19 +229,19 @@ for dtype in range_prec: letter+"gd",letter+"sb",letter+"sg", letter+"bb","glm","gqr", "gsv","csd","lse", - letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp"), + letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp",letter+"dmd"), ("Nonsymmetric-Eigenvalue-Problem", "Symmetric-Eigenvalue-Problem", "Symmetric-Eigenvalue-Problem-2-stage", "Singular-Value-Decomposition", "Eigen-Condition","Nonsymmetric-Eigenvalue","Nonsymmetric-Generalized-Eigenvalue-Problem", "Nonsymmetric-Generalized-Eigenvalue-Problem-driver", 
"Symmetric-Eigenvalue-Problem", "Symmetric-Eigenvalue-Generalized-Problem", "Banded-Singular-Value-Decomposition-routines", "Generalized-Linear-Regression-Model-routines", "Generalized-QR-and-RQ-factorization-routines", "Generalized-Singular-Value-Decomposition-routines", "CS-Decomposition-routines", "Constrained-Linear-Least-Squares-routines", - "Linear-Equation-routines", "Mixed-Precision-linear-equation-routines","RFP-linear-equation-routines"), + "Linear-Equation-routines", "Mixed-Precision-linear-equation-routines","RFP-linear-equation-routines","Dynamic-Mode-Decomposition"), (letter+"nep", letter+"sep", letter+"se2", letter+"svd", letter+"ec",letter+"ed",letter+"gg", letter+"gd",letter+"sb",letter+"sg", letter+"bb",letter+"glm",letter+"gqr", letter+"gsv",letter+"csd",letter+"lse", - letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp"), + letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp",letter+"dmd"), ) @@ -252,7 +250,7 @@ for dtype in range_prec: # NEED TO SKIP SOME PRECISION (namely s and c) FOR PROTO MIXED PRECISION TESTING if dtest==17 and (letter=="s" or letter=="c"): continue - if (with_file==1): + if with_file: cmdbase=dtests[2][dtest]+".out" else: if dtest==16: @@ -264,10 +262,13 @@ for dtype in range_prec: elif dtest==18: # PROTO LIN TESTS cmdbase="LIN/xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + elif dtest==20: + # DMD EIG TESTS + cmdbase="EIG/xdmdeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" else: # EIG TESTS cmdbase="EIG/xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" - if (not just_errors and not short_summary): + if not just_errors and not short_summary: print("Testing "+name+" "+dtests[1][dtest]+"-"+cmdbase, end=' ') # Run the process: either to read the file or run the LAPACK testing nb_test = run_summary_test(f, cmdbase, short_summary) @@ -277,19 +278,19 @@ for dtype in range_prec: list_results[3][dtype]+=nb_test[3] got_error=nb_test[1]+nb_test[2]+nb_test[3] - if (not short_summary): - if (nb_test[0]>0 and just_errors==0): + if not short_summary: + if nb_test[0] > 0 and not just_errors: print("passed: "+str(nb_test[0])) - if (nb_test[1]>0): + if nb_test[1] > 0: print("failing to pass the threshold: "+str(nb_test[1])) - if (nb_test[2]>0): + if nb_test[2] > 0: print("Illegal Error: "+str(nb_test[2])) - if (nb_test[3]>0): + if nb_test[3] > 0: print("Info Error: "+str(nb_test[3])) - if (got_error>0 and just_errors==1): + if got_error > 0 and just_errors: print("ERROR IS LOCATED IN "+name+" "+dtests[1][dtest]+" [ "+cmdbase+" ]") print("") - if (just_errors==0): + if not just_errors: print("") # elif (got_error>0): # print dtests[2][dtest]+".out \t"+str(nb_test[1])+"\t"+str(nb_test[2])+"\t"+str(nb_test[3]) @@ -307,7 +308,7 @@ for dtype in range_prec: list_results[2][4]+=list_results[2][dtype] list_results[3][4]+=list_results[3][dtype] -if only_numbers==1: +if only_numbers: print(str(list_results[1][4])+"\n"+str(list_results[2][4]+list_results[3][4])) else: print(summary) diff --git a/param.h b/param.h index ee4640f57..469c38ce3 100644 --- a/param.h +++ b/param.h @@ -2853,13 +2853,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #define QGEMM_DEFAULT_UNROLL_N 2 -#define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #define QGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_M 1 -#define ZGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_P 256 @@ -2888,11 +2888,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 1 -#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_N 4 @@ -3359,13 +3359,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P 128 -#define DGEMM_DEFAULT_P 160 +#define SGEMM_DEFAULT_P 240 +#define DGEMM_DEFAULT_P 240 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q 352 -#define DGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 320 #define CGEMM_DEFAULT_Q 224 #define ZGEMM_DEFAULT_Q 112 @@ -3396,13 +3396,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define ZGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_MN 16 -#define SGEMM_DEFAULT_P 128 -#define DGEMM_DEFAULT_P 160 +#define SGEMM_DEFAULT_P 240 +#define DGEMM_DEFAULT_P 240 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q 352 -#define DGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 320 #define CGEMM_DEFAULT_Q 224 #define ZGEMM_DEFAULT_Q 112 diff --git a/test/Makefile b/test/Makefile index 715842b4d..56acf1c5b 100644 --- a/test/Makefile +++ b/test/Makefile @@ -260,6 +260,7 @@ endif FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) +CLDFLAGS = $(CFLAGS) $(LDFLAGS) ifeq ($(USE_OPENMP), 1) @@ -331,7 +332,7 @@ endif ifeq ($(BUILD_BFLOAT16),1) test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME) - $(CC) $(CFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + $(CC) $(CLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) endif ifeq ($(BUILD_COMPLEX),1)
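A note on the param.h retuning above: the UNROLL_M/UNROLL_N constants pick the register-tile (micro-kernel) shape, while the P and Q constants pick the cache-block dimensions of the packed panel of A. A rough sketch of where these four numbers enter a blocked GEMM, in Python with NumPy assumed; the real OpenBLAS kernels are hand-written assembly and additionally block the N dimension and pack B, so this is only a shape-level illustration:

    # Illustrative only: P/Q cache blocking with an MR x NR micro-kernel.
    import numpy as np

    P, Q = 240, 320        # cache-block sizes, cf. DGEMM_DEFAULT_P / _Q above
    MR, NR = 8, 4          # register tile, cf. DGEMM_DEFAULT_UNROLL_M / _N
    assert P % MR == 0     # P is kept a multiple of MR, as in the tuned values

    def blocked_gemm(a, b):
        m, k = a.shape
        _, n = b.shape
        c = np.zeros((m, n))
        for p0 in range(0, k, Q):                 # K dimension blocked by Q
            bp = b[p0:p0 + Q, :]
            for i0 in range(0, m, P):             # M dimension blocked by P
                ap = a[i0:i0 + P, p0:p0 + Q]      # "packed" P x Q panel of A
                for j in range(0, n, NR):         # micro-kernel sweep over
                    for i in range(0, ap.shape[0], MR):   # MR x NR tiles of C
                        c[i0 + i:i0 + i + MR, j:j + NR] += (
                            ap[i:i + MR, :] @ bp[:, j:j + NR])
        return c

    rng = np.random.default_rng(0)
    a, b = rng.random((300, 350)), rng.random((350, 280))
    assert np.allclose(blocked_gemm(a, b), a @ b)

The patch's move from P=128/160 and Q=352/128 to P=240 and Q=320/640 enlarges the packed panel to make better use of the target's cache, while the new CGEMM/ZGEMM 2x2 unrolls change only the register-tile shape.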